mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 18:14:51 +00:00 
			
		
		
		
	rfctr(part): remove double-decoration 3 (#3687)
**Summary** Install new `@apply_metadata()` on HTML and remove decorators from delegating partitioners EPUB, MD, ORG, RST, and RTF. **Additional Context** - All five of these delegating partitioners delegate to `partition_html()`, so they're something of a matched set. EML and MSG also partially delegate to HTML but that's a harder problem (they also delegate to all other partitioners for attachments) that we'll address a couple of PRs later. - Replace use of `@process_metadata()` and `@add_metadata_with_filetype()` decorators with `@apply_metadata()` on `partition_html()`. - Remove all decorators from delegating partitioners; this removes the "double-decorating".
This commit is contained in:
		
							parent
							
								
									17092198d0
								
							
						
					
					
						commit
						9bd91a836e
					
				| @ -1,4 +1,4 @@ | |||||||
| ## 0.15.14-dev8 | ## 0.15.14-dev9 | ||||||
| 
 | 
 | ||||||
| ### Enhancements | ### Enhancements | ||||||
| 
 | 
 | ||||||
| @ -14,6 +14,7 @@ | |||||||
| * **Allow empty text files.** Fixes an issue where text files with only white space would fail to be partitioned. | * **Allow empty text files.** Fixes an issue where text files with only white space would fail to be partitioned. | ||||||
| * **Remove double-decoration for CSV, DOC, ODT partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (CSV and DOCX in this case); remove decoration from delegating partitioners. | * **Remove double-decoration for CSV, DOC, ODT partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (CSV and DOCX in this case); remove decoration from delegating partitioners. | ||||||
| * **Remove double-decoration for PPT, PPTX, TSV, XLSX, and XML partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner; remove decoration from delegating partitioners. | * **Remove double-decoration for PPT, PPTX, TSV, XLSX, and XML partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner; remove decoration from delegating partitioners. | ||||||
|  | * **Remove double-decoration for HTML, EPUB, MD, ORG, RST, and RTF partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (HTML in this case); remove decoration from delegating partitioners. | ||||||
| 
 | 
 | ||||||
| ## 0.15.13 | ## 0.15.13 | ||||||
| 
 | 
 | ||||||
|  | |||||||
							
								
								
									
										
											BIN
										
									
								
								example-docs/simple.epub
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								example-docs/simple.epub
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| @ -1200,7 +1200,6 @@ def opts_args() -> dict[str, Any]: | |||||||
|         "url": None, |         "url": None, | ||||||
|         "headers": {}, |         "headers": {}, | ||||||
|         "ssl_verify": True, |         "ssl_verify": True, | ||||||
|         "metadata_last_modified": None, |  | ||||||
|         "skip_headers_and_footers": False, |         "skip_headers_and_footers": False, | ||||||
|         "detection_origin": None, |         "detection_origin": None, | ||||||
|     } |     } | ||||||
| @ -1301,15 +1300,7 @@ class DescribeHtmlPartitionerOptions: | |||||||
| 
 | 
 | ||||||
|     # -- .last_modified -------------------------- |     # -- .last_modified -------------------------- | ||||||
| 
 | 
 | ||||||
|     def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided( |     def it_gets_last_modified_from_the_filesystem_when_file_path_is_provided( | ||||||
|         self, opts_args: dict[str, Any] |  | ||||||
|     ): |  | ||||||
|         opts_args["metadata_last_modified"] = "2024-03-05T17:02:53" |  | ||||||
|         opts = HtmlPartitionerOptions(**opts_args) |  | ||||||
| 
 |  | ||||||
|         assert opts.last_modified == "2024-03-05T17:02:53" |  | ||||||
| 
 |  | ||||||
|     def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided( |  | ||||||
|         self, opts_args: dict[str, Any], get_last_modified_date_: Mock |         self, opts_args: dict[str, Any], get_last_modified_date_: Mock | ||||||
|     ): |     ): | ||||||
|         opts_args["file_path"] = "a/b/document.html" |         opts_args["file_path"] = "a/b/document.html" | ||||||
|  | |||||||
| @ -10,12 +10,11 @@ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_ME | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_partition_epub_from_filename(): | def test_partition_epub_from_filename(): | ||||||
|     filename = example_doc_path("winter-sports.epub") |     elements = partition_epub(example_doc_path("simple.epub")) | ||||||
|     elements = partition_epub(filename=filename) | 
 | ||||||
|     assert len(elements) > 0 |     assert len(elements) > 0 | ||||||
|     assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports") |     assert isinstance(elements[0], Text) | ||||||
|     for element in elements: |     assert elements[0].text.startswith("a shared culture") | ||||||
|         assert element.metadata.filename == "winter-sports.epub" |  | ||||||
|     if UNSTRUCTURED_INCLUDE_DEBUG_METADATA: |     if UNSTRUCTURED_INCLUDE_DEBUG_METADATA: | ||||||
|         assert {element.metadata.detection_origin for element in elements} == {"epub"} |         assert {element.metadata.detection_origin for element in elements} == {"epub"} | ||||||
| 
 | 
 | ||||||
| @ -28,37 +27,56 @@ def test_partition_epub_from_filename_returns_table_in_elements(): | |||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_partition_epub_from_filename_returns_uns_elements(): |  | ||||||
|     filename = example_doc_path("winter-sports.epub") |  | ||||||
|     elements = partition_epub(filename=filename) |  | ||||||
|     assert len(elements) > 0 |  | ||||||
|     assert isinstance(elements[0], Text) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_partition_epub_from_filename_with_metadata_filename(): |  | ||||||
|     filename = example_doc_path("winter-sports.epub") |  | ||||||
|     elements = partition_epub(filename=filename, metadata_filename="test") |  | ||||||
|     assert len(elements) > 0 |  | ||||||
|     assert all(element.metadata.filename == "test" for element in elements) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_partition_epub_from_file(): | def test_partition_epub_from_file(): | ||||||
|     filename = example_doc_path("winter-sports.epub") |     with open(example_doc_path("winter-sports.epub"), "rb") as f: | ||||||
|     with open(filename, "rb") as f: |  | ||||||
|         elements = partition_epub(file=f) |         elements = partition_epub(file=f) | ||||||
|  | 
 | ||||||
|     assert len(elements) > 0 |     assert len(elements) > 0 | ||||||
|     assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports") |     assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports") | ||||||
|     for element in elements: |  | ||||||
|         assert element.metadata.filename is None |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_partition_epub_from_file_with_metadata_filename(): | # -- .metadata.filename -------------------------------------------------------------------------- | ||||||
|     filename = example_doc_path("winter-sports.epub") | 
 | ||||||
|     with open(filename, "rb") as f: | 
 | ||||||
|         elements = partition_epub(file=f, metadata_filename="test") | def test_partition_epub_from_filename_gets_filename_from_filename_arg(): | ||||||
|  |     elements = partition_epub(example_doc_path("simple.epub")) | ||||||
|  | 
 | ||||||
|     assert len(elements) > 0 |     assert len(elements) > 0 | ||||||
|     for element in elements: |     assert all(e.metadata.filename == "simple.epub" for e in elements) | ||||||
|         assert element.metadata.filename == "test" | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_epub_from_file_gets_filename_None(): | ||||||
|  |     with open(example_doc_path("simple.epub"), "rb") as f: | ||||||
|  |         elements = partition_epub(file=f) | ||||||
|  | 
 | ||||||
|  |     assert len(elements) > 0 | ||||||
|  |     assert all(e.metadata.filename is None for e in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_epub_from_filename_prefers_metadata_filename(): | ||||||
|  |     elements = partition_epub(example_doc_path("simple.epub"), metadata_filename="orig-name.epub") | ||||||
|  | 
 | ||||||
|  |     assert len(elements) > 0 | ||||||
|  |     assert all(element.metadata.filename == "orig-name.epub" for element in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_epub_from_file_prefers_metadata_filename(): | ||||||
|  |     with open(example_doc_path("simple.epub"), "rb") as f: | ||||||
|  |         elements = partition_epub(file=f, metadata_filename="orig-name.epub") | ||||||
|  | 
 | ||||||
|  |     assert all(e.metadata.filename == "orig-name.epub" for e in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # -- .metadata.filetype -------------------------------------------------------------------------- | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_epub_gets_the_EPUB_MIME_type_in_metadata_filetype(): | ||||||
|  |     EPUB_MIME_TYPE = "application/epub" | ||||||
|  |     elements = partition_epub(example_doc_path("simple.epub")) | ||||||
|  |     assert all(e.metadata.filetype == EPUB_MIME_TYPE for e in elements), ( | ||||||
|  |         f"Expected all elements to have '{EPUB_MIME_TYPE}' as their filetype, but got:" | ||||||
|  |         f" {repr(elements[0].metadata.filetype)}" | ||||||
|  |     ) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # -- .metadata.last_modified --------------------------------------------------------------------- | # -- .metadata.last_modified --------------------------------------------------------------------- | ||||||
| @ -72,10 +90,17 @@ def test_partition_epub_from_file_path_gets_last_modified_from_filesystem(mocker | |||||||
| 
 | 
 | ||||||
|     elements = partition_epub(example_doc_path("winter-sports.epub")) |     elements = partition_epub(example_doc_path("winter-sports.epub")) | ||||||
| 
 | 
 | ||||||
|     assert elements[0].metadata.last_modified == filesystem_last_modified |     assert all(e.metadata.last_modified == filesystem_last_modified for e in elements) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_partition_xml_from_file_path_prefers_metadata_last_modified(mocker: MockFixture): | def test_partition_epub_from_file_gets_last_modified_None(): | ||||||
|  |     with open(example_doc_path("simple.epub"), "rb") as f: | ||||||
|  |         elements = partition_epub(file=f) | ||||||
|  | 
 | ||||||
|  |     assert all(e.metadata.last_modified is None for e in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_epub_from_file_path_prefers_metadata_last_modified(mocker: MockFixture): | ||||||
|     filesystem_last_modified = "2024-06-14T16:01:29" |     filesystem_last_modified = "2024-06-14T16:01:29" | ||||||
|     metadata_last_modified = "2020-03-08T06:10:23" |     metadata_last_modified = "2020-03-08T06:10:23" | ||||||
|     mocker.patch( |     mocker.patch( | ||||||
| @ -89,6 +114,14 @@ def test_partition_xml_from_file_path_prefers_metadata_last_modified(mocker: Moc | |||||||
|     assert all(e.metadata.last_modified == metadata_last_modified for e in elements) |     assert all(e.metadata.last_modified == metadata_last_modified for e in elements) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def test_partition_epub_from_file_prefers_metadata_last_modified(): | ||||||
|  |     metadata_last_modified = "2020-03-08T06:10:23" | ||||||
|  |     with open(example_doc_path("simple.epub"), "rb") as f: | ||||||
|  |         elements = partition_epub(file=f, metadata_last_modified=metadata_last_modified) | ||||||
|  | 
 | ||||||
|  |     assert all(e.metadata.last_modified is metadata_last_modified for e in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| # ------------------------------------------------------------------------------------------------ | # ------------------------------------------------------------------------------------------------ | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -17,55 +17,29 @@ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_ME | |||||||
| def test_partition_md_from_filename(): | def test_partition_md_from_filename(): | ||||||
|     filename = example_doc_path("README.md") |     filename = example_doc_path("README.md") | ||||||
|     elements = partition_md(filename=filename) |     elements = partition_md(filename=filename) | ||||||
|     assert "PageBreak" not in [elem.category for elem in elements] | 
 | ||||||
|     assert len(elements) > 0 |     assert len(elements) > 0 | ||||||
|     for element in elements: |     assert "PageBreak" not in [elem.category for elem in elements] | ||||||
|         assert element.metadata.filename == "README.md" |     assert isinstance(elements[0], Title) | ||||||
|     if UNSTRUCTURED_INCLUDE_DEBUG_METADATA: |     if UNSTRUCTURED_INCLUDE_DEBUG_METADATA: | ||||||
|         assert {element.metadata.detection_origin for element in elements} == {"md"} |         assert {element.metadata.detection_origin for element in elements} == {"md"} | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_partition_md_from_filename_returns_uns_elements(): |  | ||||||
|     filename = example_doc_path("README.md") |  | ||||||
|     elements = partition_md(filename=filename) |  | ||||||
|     assert len(elements) > 0 |  | ||||||
|     assert isinstance(elements[0], Title) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_partition_md_from_filename_with_metadata_filename(): |  | ||||||
|     filename = example_doc_path("README.md") |  | ||||||
|     elements = partition_md(filename=filename, metadata_filename="test") |  | ||||||
|     assert "PageBreak" not in [elem.category for elem in elements] |  | ||||||
|     assert len(elements) > 0 |  | ||||||
|     for element in elements: |  | ||||||
|         assert element.metadata.filename == "test" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_partition_md_from_file(): | def test_partition_md_from_file(): | ||||||
|     filename = example_doc_path("README.md") |     filename = example_doc_path("README.md") | ||||||
|     with open(filename, "rb") as f: |     with open(filename, "rb") as f: | ||||||
|         elements = partition_md(file=f) |         elements = partition_md(file=f) | ||||||
|     assert len(elements) > 0 |     assert len(elements) > 0 | ||||||
|     for element in elements: |  | ||||||
|         assert element.metadata.filename is None |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_partition_md_from_file_with_metadata_filename(): |  | ||||||
|     filename = example_doc_path("README.md") |  | ||||||
|     with open(filename, "rb") as f: |  | ||||||
|         elements = partition_md(file=f, metadata_filename="test") |  | ||||||
|     assert len(elements) > 0 |  | ||||||
|     assert all(element.metadata.filename == "test" for element in elements) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_partition_md_from_text(): | def test_partition_md_from_text(): | ||||||
|     filename = example_doc_path("README.md") |     with open(example_doc_path("README.md")) as f: | ||||||
|     with open(filename) as f: |  | ||||||
|         text = f.read() |         text = f.read() | ||||||
|  | 
 | ||||||
|     elements = partition_md(text=text) |     elements = partition_md(text=text) | ||||||
|  | 
 | ||||||
|     assert len(elements) > 0 |     assert len(elements) > 0 | ||||||
|     for element in elements: |     assert all(e.metadata.filename is None for e in elements) | ||||||
|         assert element.metadata.filename is None |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class MockResponse: | class MockResponse: | ||||||
| @ -90,8 +64,7 @@ def test_partition_md_from_url(): | |||||||
|         elements = partition_md(url="https://fake.url") |         elements = partition_md(url="https://fake.url") | ||||||
| 
 | 
 | ||||||
|     assert len(elements) > 0 |     assert len(elements) > 0 | ||||||
|     for element in elements: |     assert all(e.metadata.filename is None for e in elements) | ||||||
|         assert element.metadata.filename is None |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_partition_md_from_url_raises_with_bad_status_code(): | def test_partition_md_from_url_raises_with_bad_status_code(): | ||||||
| @ -136,6 +109,50 @@ def test_partition_md_raises_with_too_many_specified(): | |||||||
|         partition_md(filename=filename, text=text) |         partition_md(filename=filename, text=text) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | # -- .metadata.filename -------------------------------------------------------------------------- | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_md_from_filename_gets_filename_from_filename_arg(): | ||||||
|  |     elements = partition_md(example_doc_path("README.md")) | ||||||
|  | 
 | ||||||
|  |     assert len(elements) > 0 | ||||||
|  |     assert all(e.metadata.filename == "README.md" for e in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_md_from_file_gets_filename_None(): | ||||||
|  |     with open(example_doc_path("README.md"), "rb") as f: | ||||||
|  |         elements = partition_md(file=f) | ||||||
|  | 
 | ||||||
|  |     assert len(elements) > 0 | ||||||
|  |     assert all(e.metadata.filename is None for e in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_md_from_filename_prefers_metadata_filename(): | ||||||
|  |     elements = partition_md(example_doc_path("README.md"), metadata_filename="orig-name.md") | ||||||
|  | 
 | ||||||
|  |     assert len(elements) > 0 | ||||||
|  |     assert all(element.metadata.filename == "orig-name.md" for element in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_md_from_file_prefers_metadata_filename(): | ||||||
|  |     with open(example_doc_path("README.md"), "rb") as f: | ||||||
|  |         elements = partition_md(file=f, metadata_filename="orig-name.md") | ||||||
|  | 
 | ||||||
|  |     assert all(e.metadata.filename == "orig-name.md" for e in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # -- .metadata.filetype -------------------------------------------------------------------------- | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_md_gets_the_MD_MIME_type_in_metadata_filetype(): | ||||||
|  |     MD_MIME_TYPE = "text/markdown" | ||||||
|  |     elements = partition_md(example_doc_path("README.md")) | ||||||
|  |     assert all(e.metadata.filetype == MD_MIME_TYPE for e in elements), ( | ||||||
|  |         f"Expected all elements to have '{MD_MIME_TYPE}' as their filetype, but got:" | ||||||
|  |         f" {repr(elements[0].metadata.filetype)}" | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| # -- .metadata.last_modified --------------------------------------------------------------------- | # -- .metadata.last_modified --------------------------------------------------------------------- | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -15,30 +15,61 @@ def test_partition_org_from_filename(): | |||||||
|     assert elements[0].metadata.filetype == "text/org" |     assert elements[0].metadata.filetype == "text/org" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_partition_org_from_filename_with_metadata_filename(): |  | ||||||
|     elements = partition_org(example_doc_path("README.org"), metadata_filename="test") |  | ||||||
| 
 |  | ||||||
|     assert elements[0] == Title("Example Docs") |  | ||||||
|     assert elements[0].metadata.filename == "test" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_partition_org_from_file(): | def test_partition_org_from_file(): | ||||||
|     with open(example_doc_path("README.org"), "rb") as f: |     with open(example_doc_path("README.org"), "rb") as f: | ||||||
|         elements = partition_org(file=f) |         elements = partition_org(file=f) | ||||||
| 
 | 
 | ||||||
|     assert elements[0] == Title("Example Docs") |     assert elements[0] == Title("Example Docs") | ||||||
|     assert elements[0].metadata.filetype == "text/org" |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_partition_org_from_file_with_metadata_filename(): | # -- .metadata.filename -------------------------------------------------------------------------- | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_org_from_filename_gets_filename_from_filename_arg(): | ||||||
|  |     elements = partition_org(example_doc_path("README.org")) | ||||||
|  | 
 | ||||||
|  |     assert len(elements) > 0 | ||||||
|  |     assert all(e.metadata.filename == "README.org" for e in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_org_from_file_gets_filename_None(): | ||||||
|     with open(example_doc_path("README.org"), "rb") as f: |     with open(example_doc_path("README.org"), "rb") as f: | ||||||
|         elements = partition_org(file=f, metadata_filename="test") |         elements = partition_org(file=f) | ||||||
| 
 | 
 | ||||||
|     assert elements[0] == Title("Example Docs") |     assert len(elements) > 0 | ||||||
|     assert elements[0].metadata.filename == "test" |     assert all(e.metadata.filename is None for e in elements) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_partition_org_pulls_last_modified_from_filesystem(mocker: MockFixture): | def test_partition_org_from_filename_prefers_metadata_filename(): | ||||||
|  |     elements = partition_org(example_doc_path("README.org"), metadata_filename="orig-name.org") | ||||||
|  | 
 | ||||||
|  |     assert len(elements) > 0 | ||||||
|  |     assert all(element.metadata.filename == "orig-name.org" for element in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_org_from_file_prefers_metadata_filename(): | ||||||
|  |     with open(example_doc_path("README.org"), "rb") as f: | ||||||
|  |         elements = partition_org(file=f, metadata_filename="orig-name.org") | ||||||
|  | 
 | ||||||
|  |     assert all(e.metadata.filename == "orig-name.org" for e in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # -- .metadata.filetype -------------------------------------------------------------------------- | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_org_gets_the_ORG_MIME_type_in_metadata_filetype(): | ||||||
|  |     ORG_MIME_TYPE = "text/org" | ||||||
|  |     elements = partition_org(example_doc_path("README.org")) | ||||||
|  |     assert all(e.metadata.filetype == ORG_MIME_TYPE for e in elements), ( | ||||||
|  |         f"Expected all elements to have '{ORG_MIME_TYPE}' as their filetype, but got:" | ||||||
|  |         f" {repr(elements[0].metadata.filetype)}" | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # -- .metadata.last_modified --------------------------------------------------------------------- | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_org_from_filename_gets_last_modified_from_filesystem(mocker: MockFixture): | ||||||
|     filesystem_last_modified = "2024-06-14T16:01:29" |     filesystem_last_modified = "2024-06-14T16:01:29" | ||||||
|     mocker.patch( |     mocker.patch( | ||||||
|         "unstructured.partition.org.get_last_modified_date", return_value=filesystem_last_modified |         "unstructured.partition.org.get_last_modified_date", return_value=filesystem_last_modified | ||||||
| @ -46,10 +77,17 @@ def test_partition_org_pulls_last_modified_from_filesystem(mocker: MockFixture): | |||||||
| 
 | 
 | ||||||
|     elements = partition_org(example_doc_path("README.org")) |     elements = partition_org(example_doc_path("README.org")) | ||||||
| 
 | 
 | ||||||
|     assert elements[0].metadata.last_modified == filesystem_last_modified |     assert all(e.metadata.last_modified == filesystem_last_modified for e in elements) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_partition_org_prefers_metadata_last_modified(mocker: MockFixture): | def test_partition_org_from_file_gets_last_modified_None(): | ||||||
|  |     with open(example_doc_path("README.org"), "rb") as f: | ||||||
|  |         elements = partition_org(file=f) | ||||||
|  | 
 | ||||||
|  |     assert all(e.metadata.last_modified is None for e in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_org_from_filename_prefers_metadata_last_modified(mocker: MockFixture): | ||||||
|     filesystem_last_modified = "2020-08-04T06:11:47" |     filesystem_last_modified = "2020-08-04T06:11:47" | ||||||
|     metadata_last_modified = "2024-06-14T16:01:29" |     metadata_last_modified = "2024-06-14T16:01:29" | ||||||
|     mocker.patch( |     mocker.patch( | ||||||
| @ -63,6 +101,17 @@ def test_partition_org_prefers_metadata_last_modified(mocker: MockFixture): | |||||||
|     assert all(e.metadata.last_modified == metadata_last_modified for e in elements) |     assert all(e.metadata.last_modified == metadata_last_modified for e in elements) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def test_partition_org_from_file_prefers_metadata_last_modified(): | ||||||
|  |     metadata_last_modified = "2020-07-05T09:24:28" | ||||||
|  |     with open(example_doc_path("README.org"), "rb") as f: | ||||||
|  |         elements = partition_org(file=f, metadata_last_modified=metadata_last_modified) | ||||||
|  | 
 | ||||||
|  |     assert all(e.metadata.last_modified == metadata_last_modified for e in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # ------------------------------------------------------------------------------------------------ | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def test_partition_org_with_json(): | def test_partition_org_with_json(): | ||||||
|     elements = partition_org(example_doc_path("README.org")) |     elements = partition_org(example_doc_path("README.org")) | ||||||
|     assert_round_trips_through_JSON(elements) |     assert_round_trips_through_JSON(elements) | ||||||
|  | |||||||
| @ -10,21 +10,7 @@ from unstructured.partition.rst import partition_rst | |||||||
| 
 | 
 | ||||||
| def test_partition_rst_from_filename(): | def test_partition_rst_from_filename(): | ||||||
|     elements = partition_rst(example_doc_path("README.rst")) |     elements = partition_rst(example_doc_path("README.rst")) | ||||||
| 
 |  | ||||||
|     assert elements[0] == Title("Example Docs") |     assert elements[0] == Title("Example Docs") | ||||||
|     assert elements[0].metadata.filetype == "text/x-rst" |  | ||||||
|     for element in elements: |  | ||||||
|         assert element.metadata.filename == "README.rst" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_partition_rst_from_filename_returns_uns_elements(): |  | ||||||
|     elements = partition_rst(example_doc_path("README.rst")) |  | ||||||
|     assert isinstance(elements[0], Title) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_partition_rst_from_filename_with_metadata_filename(): |  | ||||||
|     elements = partition_rst(example_doc_path("README.rst"), metadata_filename="test") |  | ||||||
|     assert all(element.metadata.filename == "test" for element in elements) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_partition_rst_from_file(): | def test_partition_rst_from_file(): | ||||||
| @ -32,18 +18,50 @@ def test_partition_rst_from_file(): | |||||||
|         elements = partition_rst(file=f) |         elements = partition_rst(file=f) | ||||||
| 
 | 
 | ||||||
|     assert elements[0] == Title("Example Docs") |     assert elements[0] == Title("Example Docs") | ||||||
|     assert elements[0].metadata.filetype == "text/x-rst" |  | ||||||
|     for element in elements: |  | ||||||
|         assert element.metadata.filename is None |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_partition_rst_from_file_with_metadata_filename(): | # -- .metadata.filename -------------------------------------------------------------------------- | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_rst_from_filename_gets_filename_from_filename_arg(): | ||||||
|  |     elements = partition_rst(example_doc_path("README.rst")) | ||||||
|  | 
 | ||||||
|  |     assert len(elements) > 0 | ||||||
|  |     assert all(e.metadata.filename == "README.rst" for e in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_rst_from_file_gets_filename_None(): | ||||||
|     with open(example_doc_path("README.rst"), "rb") as f: |     with open(example_doc_path("README.rst"), "rb") as f: | ||||||
|         elements = partition_rst(file=f, metadata_filename="test") |         elements = partition_rst(file=f) | ||||||
| 
 | 
 | ||||||
|     assert elements[0] == Title("Example Docs") |     assert len(elements) > 0 | ||||||
|     for element in elements: |     assert all(e.metadata.filename is None for e in elements) | ||||||
|         assert element.metadata.filename == "test" | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_rst_from_filename_prefers_metadata_filename(): | ||||||
|  |     elements = partition_rst(example_doc_path("README.rst"), metadata_filename="orig-name.rst") | ||||||
|  | 
 | ||||||
|  |     assert len(elements) > 0 | ||||||
|  |     assert all(element.metadata.filename == "orig-name.rst" for element in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_rst_from_file_prefers_metadata_filename(): | ||||||
|  |     with open(example_doc_path("README.rst"), "rb") as f: | ||||||
|  |         elements = partition_rst(file=f, metadata_filename="orig-name.rst") | ||||||
|  | 
 | ||||||
|  |     assert all(e.metadata.filename == "orig-name.rst" for e in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # -- .metadata.filetype -------------------------------------------------------------------------- | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_rst_gets_the_RST_MIME_type_in_metadata_filetype(): | ||||||
|  |     RST_MIME_TYPE = "text/x-rst" | ||||||
|  |     elements = partition_rst(example_doc_path("README.rst")) | ||||||
|  |     assert all(e.metadata.filetype == RST_MIME_TYPE for e in elements), ( | ||||||
|  |         f"Expected all elements to have '{RST_MIME_TYPE}' as their filetype, but got:" | ||||||
|  |         f" {repr(elements[0].metadata.filetype)}" | ||||||
|  |     ) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # -- .metadata.last_modified --------------------------------------------------------------------- | # -- .metadata.last_modified --------------------------------------------------------------------- | ||||||
|  | |||||||
| @ -9,41 +9,68 @@ from unstructured.partition.rtf import partition_rtf | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_partition_rtf_from_filename(): | def test_partition_rtf_from_filename(): | ||||||
|     filename = example_doc_path("fake-doc.rtf") |     elements = partition_rtf(example_doc_path("fake-doc.rtf")) | ||||||
|     elements = partition_rtf(filename=filename) | 
 | ||||||
|     assert len(elements) > 0 |     assert len(elements) > 0 | ||||||
|     assert elements[0] == Title("My First Heading") |     assert elements[0] == Title("My First Heading") | ||||||
|     assert elements[-1] == Table( |     assert elements[-1] == Table( | ||||||
|         text="Column 1 Column 2 Row 1, Cell 1 Row 1, Cell 2 Row 2, Cell 1 Row 2, Cell 2" |         text="Column 1 Column 2 Row 1, Cell 1 Row 1, Cell 2 Row 2, Cell 1 Row 2, Cell 2" | ||||||
|     ) |     ) | ||||||
|     for element in elements: |  | ||||||
|         assert element.metadata.filename == "fake-doc.rtf" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_partition_rtf_from_filename_with_metadata_filename(): |  | ||||||
|     filename = example_doc_path("fake-doc.rtf") |  | ||||||
|     elements = partition_rtf(filename=filename, metadata_filename="test") |  | ||||||
|     assert len(elements) > 0 |  | ||||||
|     assert all(element.metadata.filename == "test" for element in elements) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_partition_rtf_from_file(): | def test_partition_rtf_from_file(): | ||||||
|     filename = example_doc_path("fake-doc.rtf") |     with open(example_doc_path("fake-doc.rtf"), "rb") as f: | ||||||
|     with open(filename, "rb") as f: |  | ||||||
|         elements = partition_rtf(file=f) |         elements = partition_rtf(file=f) | ||||||
|  | 
 | ||||||
|     assert len(elements) > 0 |     assert len(elements) > 0 | ||||||
|     assert elements[0] == Title("My First Heading") |     assert elements[0] == Title("My First Heading") | ||||||
|     for element in elements: |  | ||||||
|         assert element.metadata.filename is None |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_partition_rtf_from_file_with_metadata_filename(): | # -- .metadata.filename -------------------------------------------------------------------------- | ||||||
|     filename = example_doc_path("fake-doc.rtf") | 
 | ||||||
|     with open(filename, "rb") as f: | 
 | ||||||
|         elements = partition_rtf(file=f, metadata_filename="test") | def test_partition_rtf_from_filename_gets_filename_from_filename_arg(): | ||||||
|     assert elements[0] == Title("My First Heading") |     elements = partition_rtf(example_doc_path("fake-doc.rtf")) | ||||||
|     for element in elements: | 
 | ||||||
|         assert element.metadata.filename == "test" |     assert len(elements) > 0 | ||||||
|  |     assert all(e.metadata.filename == "fake-doc.rtf" for e in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_rtf_from_file_gets_filename_None(): | ||||||
|  |     with open(example_doc_path("fake-doc.rtf"), "rb") as f: | ||||||
|  |         elements = partition_rtf(file=f) | ||||||
|  | 
 | ||||||
|  |     assert len(elements) > 0 | ||||||
|  |     assert all(e.metadata.filename is None for e in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_rtf_from_filename_prefers_metadata_filename(): | ||||||
|  |     elements = partition_rtf(example_doc_path("fake-doc.rtf"), metadata_filename="orig-name.rtf") | ||||||
|  | 
 | ||||||
|  |     assert len(elements) > 0 | ||||||
|  |     assert all(element.metadata.filename == "orig-name.rtf" for element in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_rtf_from_file_prefers_metadata_filename(): | ||||||
|  |     with open(example_doc_path("fake-doc.rtf"), "rb") as f: | ||||||
|  |         elements = partition_rtf(file=f, metadata_filename="orig-name.rtf") | ||||||
|  | 
 | ||||||
|  |     assert all(e.metadata.filename == "orig-name.rtf" for e in elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # -- .metadata.filetype -------------------------------------------------------------------------- | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_partition_rtf_gets_the_RTF_MIME_type_in_metadata_filetype(): | ||||||
|  |     RTF_MIME_TYPE = "text/rtf" | ||||||
|  |     elements = partition_rtf(example_doc_path("fake-doc.rtf")) | ||||||
|  |     assert all(e.metadata.filetype == RTF_MIME_TYPE for e in elements), ( | ||||||
|  |         f"Expected all elements to have '{RTF_MIME_TYPE}' as their filetype, but got:" | ||||||
|  |         f" {repr(elements[0].metadata.filetype)}" | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # -- .metadata.last_modified --------------------------------------------------------------------- | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_partition_rtf_pulls_last_modified_from_filesystem(mocker: MockFixture): | def test_partition_rtf_pulls_last_modified_from_filesystem(mocker: MockFixture): | ||||||
| @ -70,6 +97,9 @@ def test_partition_rtf_prefers_metadata_last_modified(mocker: MockFixture): | |||||||
|     assert all(e.metadata.last_modified == metadata_last_modified for e in elements) |     assert all(e.metadata.last_modified == metadata_last_modified for e in elements) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | # -- other --------------------------------------------------------------------------------------- | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def test_partition_rtf_with_json(): | def test_partition_rtf_with_json(): | ||||||
|     elements = partition_rtf(filename=example_doc_path("fake-doc.rtf")) |     elements = partition_rtf(filename=example_doc_path("fake-doc.rtf")) | ||||||
|     assert_round_trips_through_JSON(elements) |     assert_round_trips_through_JSON(elements) | ||||||
|  | |||||||
| @ -1 +1 @@ | |||||||
| __version__ = "0.15.14-dev8"  # pragma: no cover | __version__ = "0.15.14-dev9"  # pragma: no cover | ||||||
|  | |||||||
| @ -2,10 +2,8 @@ from __future__ import annotations | |||||||
| 
 | 
 | ||||||
| from typing import IO, Any, Optional | from typing import IO, Any, Optional | ||||||
| 
 | 
 | ||||||
| from unstructured.chunking import add_chunking_strategy | from unstructured.documents.elements import Element | ||||||
| from unstructured.documents.elements import Element, process_metadata |  | ||||||
| from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc | from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc | ||||||
| from unstructured.file_utils.filetype import add_metadata_with_filetype |  | ||||||
| from unstructured.file_utils.model import FileType | from unstructured.file_utils.model import FileType | ||||||
| from unstructured.partition.common.common import exactly_one | from unstructured.partition.common.common import exactly_one | ||||||
| from unstructured.partition.common.metadata import get_last_modified_date | from unstructured.partition.common.metadata import get_last_modified_date | ||||||
| @ -14,9 +12,6 @@ from unstructured.partition.html import partition_html | |||||||
| DETECTION_ORIGIN: str = "epub" | DETECTION_ORIGIN: str = "epub" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @process_metadata() |  | ||||||
| @add_metadata_with_filetype(FileType.EPUB) |  | ||||||
| @add_chunking_strategy |  | ||||||
| def partition_epub( | def partition_epub( | ||||||
|     filename: Optional[str] = None, |     filename: Optional[str] = None, | ||||||
|     *, |     *, | ||||||
| @ -57,9 +52,11 @@ def partition_epub( | |||||||
|     return partition_html( |     return partition_html( | ||||||
|         text=html_text, |         text=html_text, | ||||||
|         encoding="unicode", |         encoding="unicode", | ||||||
|         metadata_filename=metadata_filename, |         metadata_filename=metadata_filename or filename, | ||||||
|  |         metadata_file_type=FileType.EPUB, | ||||||
|         metadata_last_modified=metadata_last_modified or last_modified, |         metadata_last_modified=metadata_last_modified or last_modified, | ||||||
|         languages=languages, |         languages=languages, | ||||||
|         detect_language_per_element=detect_language_per_element, |         detect_language_per_element=detect_language_per_element, | ||||||
|         detection_origin=DETECTION_ORIGIN, |         detection_origin=DETECTION_ORIGIN, | ||||||
|  |         **kwargs, | ||||||
|     ) |     ) | ||||||
|  | |||||||
| @ -10,18 +10,15 @@ import requests | |||||||
| from lxml import etree | from lxml import etree | ||||||
| 
 | 
 | ||||||
| from unstructured.chunking import add_chunking_strategy | from unstructured.chunking import add_chunking_strategy | ||||||
| from unstructured.documents.elements import Element, process_metadata | from unstructured.documents.elements import Element | ||||||
| from unstructured.file_utils.encoding import read_txt_file | from unstructured.file_utils.encoding import read_txt_file | ||||||
| from unstructured.file_utils.filetype import add_metadata_with_filetype |  | ||||||
| from unstructured.file_utils.model import FileType | from unstructured.file_utils.model import FileType | ||||||
| from unstructured.partition.common.lang import apply_lang_metadata | from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date | ||||||
| from unstructured.partition.common.metadata import get_last_modified_date |  | ||||||
| from unstructured.partition.html.parser import Flow, html_parser | from unstructured.partition.html.parser import Flow, html_parser | ||||||
| from unstructured.utils import is_temp_file_path, lazyproperty | from unstructured.utils import is_temp_file_path, lazyproperty | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @process_metadata() | @apply_metadata(FileType.HTML) | ||||||
| @add_metadata_with_filetype(FileType.HTML) |  | ||||||
| @add_chunking_strategy | @add_chunking_strategy | ||||||
| def partition_html( | def partition_html( | ||||||
|     filename: Optional[str] = None, |     filename: Optional[str] = None, | ||||||
| @ -32,9 +29,6 @@ def partition_html( | |||||||
|     url: Optional[str] = None, |     url: Optional[str] = None, | ||||||
|     headers: dict[str, str] = {}, |     headers: dict[str, str] = {}, | ||||||
|     ssl_verify: bool = True, |     ssl_verify: bool = True, | ||||||
|     detect_language_per_element: bool = False, |  | ||||||
|     languages: Optional[list[str]] = ["auto"], |  | ||||||
|     metadata_last_modified: Optional[str] = None, |  | ||||||
|     skip_headers_and_footers: bool = False, |     skip_headers_and_footers: bool = False, | ||||||
|     detection_origin: Optional[str] = None, |     detection_origin: Optional[str] = None, | ||||||
|     **kwargs: Any, |     **kwargs: Any, | ||||||
| @ -60,18 +54,6 @@ def partition_html( | |||||||
|         on the HTTP request. |         on the HTTP request. | ||||||
|     encoding |     encoding | ||||||
|         The encoding method used to decode the text input. If None, utf-8 will be used. |         The encoding method used to decode the text input. If None, utf-8 will be used. | ||||||
| 
 |  | ||||||
|     Other parameters |  | ||||||
|     ---------------- |  | ||||||
|     languages |  | ||||||
|         User defined value for `metadata.languages` if provided. Otherwise language is detected |  | ||||||
|         using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be |  | ||||||
|         in either language. |  | ||||||
|         Additional Parameters: |  | ||||||
|             detect_language_per_element |  | ||||||
|                 Detect language per element instead of at the document level. |  | ||||||
|     metadata_last_modified |  | ||||||
|         The last modified date for the document. |  | ||||||
|     skip_headers_and_footers |     skip_headers_and_footers | ||||||
|         If True, ignores any content that is within <header> or <footer> tags |         If True, ignores any content that is within <header> or <footer> tags | ||||||
|     """ |     """ | ||||||
| @ -87,20 +69,11 @@ def partition_html( | |||||||
|         url=url, |         url=url, | ||||||
|         headers=headers, |         headers=headers, | ||||||
|         ssl_verify=ssl_verify, |         ssl_verify=ssl_verify, | ||||||
|         metadata_last_modified=metadata_last_modified, |  | ||||||
|         skip_headers_and_footers=skip_headers_and_footers, |         skip_headers_and_footers=skip_headers_and_footers, | ||||||
|         detection_origin=detection_origin, |         detection_origin=detection_origin, | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|     elements = list( |     return list(_HtmlPartitioner.iter_elements(opts)) | ||||||
|         apply_lang_metadata( |  | ||||||
|             _HtmlPartitioner.iter_elements(opts), |  | ||||||
|             languages=languages, |  | ||||||
|             detect_language_per_element=detect_language_per_element, |  | ||||||
|         ) |  | ||||||
|     ) |  | ||||||
| 
 |  | ||||||
|     return elements |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class HtmlPartitionerOptions: | class HtmlPartitionerOptions: | ||||||
| @ -116,7 +89,6 @@ class HtmlPartitionerOptions: | |||||||
|         url: str | None, |         url: str | None, | ||||||
|         headers: dict[str, str], |         headers: dict[str, str], | ||||||
|         ssl_verify: bool, |         ssl_verify: bool, | ||||||
|         metadata_last_modified: str | None, |  | ||||||
|         skip_headers_and_footers: bool, |         skip_headers_and_footers: bool, | ||||||
|         detection_origin: str | None, |         detection_origin: str | None, | ||||||
|     ): |     ): | ||||||
| @ -127,7 +99,6 @@ class HtmlPartitionerOptions: | |||||||
|         self._url = url |         self._url = url | ||||||
|         self._headers = headers |         self._headers = headers | ||||||
|         self._ssl_verify = ssl_verify |         self._ssl_verify = ssl_verify | ||||||
|         self._metadata_last_modified = metadata_last_modified |  | ||||||
|         self._skip_headers_and_footers = skip_headers_and_footers |         self._skip_headers_and_footers = skip_headers_and_footers | ||||||
|         self._detection_origin = detection_origin |         self._detection_origin = detection_origin | ||||||
| 
 | 
 | ||||||
| @ -173,19 +144,11 @@ class HtmlPartitionerOptions: | |||||||
|     @lazyproperty |     @lazyproperty | ||||||
|     def last_modified(self) -> str | None: |     def last_modified(self) -> str | None: | ||||||
|         """The best last-modified date available, None if no sources are available.""" |         """The best last-modified date available, None if no sources are available.""" | ||||||
|         # -- Value explicitly specified by caller takes precedence. This is used for example when |         return ( | ||||||
|         # -- this file was converted from another format. |             None | ||||||
|         if self._metadata_last_modified: |             if not self._file_path or is_temp_file_path(self._file_path) | ||||||
|             return self._metadata_last_modified |             else get_last_modified_date(self._file_path) | ||||||
| 
 |         ) | ||||||
|         if self._file_path: |  | ||||||
|             return ( |  | ||||||
|                 None |  | ||||||
|                 if is_temp_file_path(self._file_path) |  | ||||||
|                 else get_last_modified_date(self._file_path) |  | ||||||
|             ) |  | ||||||
| 
 |  | ||||||
|         return None |  | ||||||
| 
 | 
 | ||||||
|     @lazyproperty |     @lazyproperty | ||||||
|     def skip_headers_and_footers(self) -> bool: |     def skip_headers_and_footers(self) -> bool: | ||||||
|  | |||||||
| @ -1,20 +1,18 @@ | |||||||
| from __future__ import annotations | from __future__ import annotations | ||||||
| 
 | 
 | ||||||
| from typing import IO, Any, Optional, Union | from typing import IO, Any | ||||||
| 
 | 
 | ||||||
| import markdown | import markdown | ||||||
| import requests | import requests | ||||||
| 
 | 
 | ||||||
| from unstructured.chunking import add_chunking_strategy | from unstructured.documents.elements import Element | ||||||
| from unstructured.documents.elements import Element, process_metadata |  | ||||||
| from unstructured.file_utils.filetype import add_metadata_with_filetype |  | ||||||
| from unstructured.file_utils.model import FileType | from unstructured.file_utils.model import FileType | ||||||
| from unstructured.partition.common.common import exactly_one | from unstructured.partition.common.common import exactly_one | ||||||
| from unstructured.partition.common.metadata import get_last_modified_date | from unstructured.partition.common.metadata import get_last_modified_date | ||||||
| from unstructured.partition.html import partition_html | from unstructured.partition.html import partition_html | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def optional_decode(contents: Union[str, bytes]) -> str: | def optional_decode(contents: str | bytes) -> str: | ||||||
|     if isinstance(contents, bytes): |     if isinstance(contents, bytes): | ||||||
|         return contents.decode("utf-8") |         return contents.decode("utf-8") | ||||||
|     return contents |     return contents | ||||||
| @ -23,19 +21,13 @@ def optional_decode(contents: Union[str, bytes]) -> str: | |||||||
| DETECTION_ORIGIN: str = "md" | DETECTION_ORIGIN: str = "md" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @process_metadata() |  | ||||||
| @add_metadata_with_filetype(FileType.MD) |  | ||||||
| @add_chunking_strategy |  | ||||||
| def partition_md( | def partition_md( | ||||||
|     filename: Optional[str] = None, |     filename: str | None = None, | ||||||
|     file: Optional[IO[bytes]] = None, |     file: IO[bytes] | None = None, | ||||||
|     text: Optional[str] = None, |     text: str | None = None, | ||||||
|     url: Optional[str] = None, |     url: str | None = None, | ||||||
|     include_page_breaks: bool = False, |     metadata_filename: str | None = None, | ||||||
|     metadata_filename: Optional[str] = None, |     metadata_last_modified: str | None = None, | ||||||
|     metadata_last_modified: Optional[str] = None, |  | ||||||
|     languages: Optional[list[str]] = ["auto"], |  | ||||||
|     detect_language_per_element: bool = False, |  | ||||||
|     **kwargs: Any, |     **kwargs: Any, | ||||||
| ) -> list[Element]: | ) -> list[Element]: | ||||||
|     """Partitions a markdown file into its constituent elements |     """Partitions a markdown file into its constituent elements | ||||||
| @ -50,24 +42,13 @@ def partition_md( | |||||||
|         The string representation of the markdown document. |         The string representation of the markdown document. | ||||||
|     url |     url | ||||||
|         The URL of a webpage to parse. Only for URLs that return a markdown document. |         The URL of a webpage to parse. Only for URLs that return a markdown document. | ||||||
|     include_page_breaks |  | ||||||
|         If True, the output will include page breaks if the filetype supports it. |  | ||||||
|     parser |  | ||||||
|         The parser to use for parsing the markdown document. If None, default parser will be used. |  | ||||||
|     metadata_last_modified |     metadata_last_modified | ||||||
|         The last modified date for the document. |         The last modified date for the document. | ||||||
|     languages |  | ||||||
|         User defined value for `metadata.languages` if provided. Otherwise language is detected |  | ||||||
|         using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be |  | ||||||
|         in either language. |  | ||||||
|         Additional Parameters: |  | ||||||
|             detect_language_per_element |  | ||||||
|                 Detect language per element instead of at the document level. |  | ||||||
|     """ |     """ | ||||||
|     # Verify that only one of the arguments was provided |  | ||||||
|     if text is None: |     if text is None: | ||||||
|         text = "" |         text = "" | ||||||
| 
 | 
 | ||||||
|  |     # -- verify that only one of the arguments was provided -- | ||||||
|     exactly_one(filename=filename, file=file, text=text, url=url) |     exactly_one(filename=filename, file=file, text=text, url=url) | ||||||
| 
 | 
 | ||||||
|     last_modified = get_last_modified_date(filename) if filename else None |     last_modified = get_last_modified_date(filename) if filename else None | ||||||
| @ -96,11 +77,9 @@ def partition_md( | |||||||
| 
 | 
 | ||||||
|     return partition_html( |     return partition_html( | ||||||
|         text=html, |         text=html, | ||||||
|         include_page_breaks=include_page_breaks, |         metadata_filename=metadata_filename or filename, | ||||||
|         source_format="md", |         metadata_file_type=FileType.MD, | ||||||
|         metadata_filename=metadata_filename, |  | ||||||
|         metadata_last_modified=metadata_last_modified or last_modified, |         metadata_last_modified=metadata_last_modified or last_modified, | ||||||
|         languages=languages, |  | ||||||
|         detect_language_per_element=detect_language_per_element, |  | ||||||
|         detection_origin=DETECTION_ORIGIN, |         detection_origin=DETECTION_ORIGIN, | ||||||
|  |         **kwargs, | ||||||
|     ) |     ) | ||||||
|  | |||||||
| @ -1,11 +1,9 @@ | |||||||
| from __future__ import annotations | from __future__ import annotations | ||||||
| 
 | 
 | ||||||
| from typing import IO, Any, Optional | from typing import IO, Any | ||||||
| 
 | 
 | ||||||
| from unstructured.chunking import add_chunking_strategy |  | ||||||
| from unstructured.documents.elements import Element | from unstructured.documents.elements import Element | ||||||
| from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc | from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc | ||||||
| from unstructured.file_utils.filetype import add_metadata_with_filetype |  | ||||||
| from unstructured.file_utils.model import FileType | from unstructured.file_utils.model import FileType | ||||||
| from unstructured.partition.common.common import exactly_one | from unstructured.partition.common.common import exactly_one | ||||||
| from unstructured.partition.common.metadata import get_last_modified_date | from unstructured.partition.common.metadata import get_last_modified_date | ||||||
| @ -14,16 +12,12 @@ from unstructured.partition.html import partition_html | |||||||
| DETECTION_ORIGIN: str = "org" | DETECTION_ORIGIN: str = "org" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @add_metadata_with_filetype(FileType.ORG) |  | ||||||
| @add_chunking_strategy |  | ||||||
| def partition_org( | def partition_org( | ||||||
|     filename: Optional[str] = None, |     filename: str | None = None, | ||||||
|     *, |     *, | ||||||
|     file: Optional[IO[bytes]] = None, |     file: IO[bytes] | None = None, | ||||||
|     metadata_filename: Optional[str] = None, |     metadata_filename: str | None = None, | ||||||
|     metadata_last_modified: Optional[str] = None, |     metadata_last_modified: str | None = None, | ||||||
|     languages: Optional[list[str]] = ["auto"], |  | ||||||
|     detect_language_per_element: bool = False, |  | ||||||
|     **kwargs: Any, |     **kwargs: Any, | ||||||
| ) -> list[Element]: | ) -> list[Element]: | ||||||
|     """Partitions an org document. The document is first converted to HTML and then |     """Partitions an org document. The document is first converted to HTML and then | ||||||
| @ -37,13 +31,6 @@ def partition_org( | |||||||
|         A file-like object using "rb" mode --> open(filename, "rb"). |         A file-like object using "rb" mode --> open(filename, "rb"). | ||||||
|     metadata_last_modified |     metadata_last_modified | ||||||
|         The last modified date for the document. |         The last modified date for the document. | ||||||
|     languages |  | ||||||
|         User defined value for `metadata.languages` if provided. Otherwise language is detected |  | ||||||
|         using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be |  | ||||||
|         in either language. |  | ||||||
|         Additional Parameters: |  | ||||||
|             detect_language_per_element |  | ||||||
|                 Detect language per element instead of at the document level. |  | ||||||
|     """ |     """ | ||||||
|     exactly_one(filename=filename, file=file) |     exactly_one(filename=filename, file=file) | ||||||
| 
 | 
 | ||||||
| @ -56,9 +43,9 @@ def partition_org( | |||||||
|     return partition_html( |     return partition_html( | ||||||
|         text=html_text, |         text=html_text, | ||||||
|         encoding="unicode", |         encoding="unicode", | ||||||
|         metadata_filename=metadata_filename, |         metadata_filename=metadata_filename or filename, | ||||||
|  |         metadata_file_type=FileType.ORG, | ||||||
|         metadata_last_modified=metadata_last_modified or last_modified, |         metadata_last_modified=metadata_last_modified or last_modified, | ||||||
|         languages=languages, |  | ||||||
|         detect_language_per_element=detect_language_per_element, |  | ||||||
|         detection_origin=DETECTION_ORIGIN, |         detection_origin=DETECTION_ORIGIN, | ||||||
|  |         **kwargs, | ||||||
|     ) |     ) | ||||||
|  | |||||||
| @ -2,10 +2,8 @@ from __future__ import annotations | |||||||
| 
 | 
 | ||||||
| from typing import IO, Any, Optional | from typing import IO, Any, Optional | ||||||
| 
 | 
 | ||||||
| from unstructured.chunking import add_chunking_strategy | from unstructured.documents.elements import Element | ||||||
| from unstructured.documents.elements import Element, process_metadata |  | ||||||
| from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc | from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc | ||||||
| from unstructured.file_utils.filetype import add_metadata_with_filetype |  | ||||||
| from unstructured.file_utils.model import FileType | from unstructured.file_utils.model import FileType | ||||||
| from unstructured.partition.common.common import exactly_one | from unstructured.partition.common.common import exactly_one | ||||||
| from unstructured.partition.common.metadata import get_last_modified_date | from unstructured.partition.common.metadata import get_last_modified_date | ||||||
| @ -14,17 +12,12 @@ from unstructured.partition.html import partition_html | |||||||
| DETECTION_ORIGIN: str = "rst" | DETECTION_ORIGIN: str = "rst" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @process_metadata() |  | ||||||
| @add_metadata_with_filetype(FileType.RST) |  | ||||||
| @add_chunking_strategy |  | ||||||
| def partition_rst( | def partition_rst( | ||||||
|     filename: Optional[str] = None, |     filename: Optional[str] = None, | ||||||
|     *, |     *, | ||||||
|     file: Optional[IO[bytes]] = None, |     file: Optional[IO[bytes]] = None, | ||||||
|     metadata_filename: Optional[str] = None, |     metadata_filename: Optional[str] = None, | ||||||
|     metadata_last_modified: Optional[str] = None, |     metadata_last_modified: Optional[str] = None, | ||||||
|     languages: Optional[list[str]] = ["auto"], |  | ||||||
|     detect_language_per_element: bool = False, |  | ||||||
|     **kwargs: Any, |     **kwargs: Any, | ||||||
| ) -> list[Element]: | ) -> list[Element]: | ||||||
|     """Partitions an RST document. The document is first converted to HTML and then |     """Partitions an RST document. The document is first converted to HTML and then | ||||||
| @ -38,13 +31,6 @@ def partition_rst( | |||||||
|         A file-like object using "rb" mode --> open(filename, "rb"). |         A file-like object using "rb" mode --> open(filename, "rb"). | ||||||
|     metadata_last_modified |     metadata_last_modified | ||||||
|         The last modified date for the document. |         The last modified date for the document. | ||||||
|     languages |  | ||||||
|         User defined value for `metadata.languages` if provided. Otherwise language is detected |  | ||||||
|         using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be |  | ||||||
|         in either language. |  | ||||||
|         Additional Parameters: |  | ||||||
|             detect_language_per_element |  | ||||||
|                 Detect language per element instead of at the document level. |  | ||||||
|     """ |     """ | ||||||
|     exactly_one(filename=filename, file=file) |     exactly_one(filename=filename, file=file) | ||||||
| 
 | 
 | ||||||
| @ -57,9 +43,9 @@ def partition_rst( | |||||||
|     return partition_html( |     return partition_html( | ||||||
|         text=html_text, |         text=html_text, | ||||||
|         encoding="unicode", |         encoding="unicode", | ||||||
|         metadata_filename=metadata_filename, |         metadata_filename=metadata_filename or filename, | ||||||
|  |         metadata_file_type=FileType.RST, | ||||||
|         metadata_last_modified=metadata_last_modified or last_modified, |         metadata_last_modified=metadata_last_modified or last_modified, | ||||||
|         languages=languages, |  | ||||||
|         detect_language_per_element=detect_language_per_element, |  | ||||||
|         detection_origin=DETECTION_ORIGIN, |         detection_origin=DETECTION_ORIGIN, | ||||||
|  |         **kwargs, | ||||||
|     ) |     ) | ||||||
|  | |||||||
| @ -2,10 +2,8 @@ from __future__ import annotations | |||||||
| 
 | 
 | ||||||
| from typing import IO, Any, Optional | from typing import IO, Any, Optional | ||||||
| 
 | 
 | ||||||
| from unstructured.chunking import add_chunking_strategy | from unstructured.documents.elements import Element | ||||||
| from unstructured.documents.elements import Element, process_metadata |  | ||||||
| from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc | from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc | ||||||
| from unstructured.file_utils.filetype import add_metadata_with_filetype |  | ||||||
| from unstructured.file_utils.model import FileType | from unstructured.file_utils.model import FileType | ||||||
| from unstructured.partition.common.common import exactly_one | from unstructured.partition.common.common import exactly_one | ||||||
| from unstructured.partition.common.metadata import get_last_modified_date | from unstructured.partition.common.metadata import get_last_modified_date | ||||||
| @ -14,17 +12,12 @@ from unstructured.partition.html import partition_html | |||||||
| DETECTION_ORIGIN: str = "rtf" | DETECTION_ORIGIN: str = "rtf" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @process_metadata() |  | ||||||
| @add_metadata_with_filetype(FileType.RTF) |  | ||||||
| @add_chunking_strategy |  | ||||||
| def partition_rtf( | def partition_rtf( | ||||||
|     filename: Optional[str] = None, |     filename: Optional[str] = None, | ||||||
|     *, |     *, | ||||||
|     file: Optional[IO[bytes]] = None, |     file: Optional[IO[bytes]] = None, | ||||||
|     metadata_filename: Optional[str] = None, |     metadata_filename: Optional[str] = None, | ||||||
|     metadata_last_modified: Optional[str] = None, |     metadata_last_modified: Optional[str] = None, | ||||||
|     languages: Optional[list[str]] = ["auto"], |  | ||||||
|     detect_language_per_element: bool = False, |  | ||||||
|     **kwargs: Any, |     **kwargs: Any, | ||||||
| ) -> list[Element]: | ) -> list[Element]: | ||||||
|     """Partitions an RTF document. The document is first converted to HTML and then |     """Partitions an RTF document. The document is first converted to HTML and then | ||||||
| @ -38,13 +31,6 @@ def partition_rtf( | |||||||
|         A file-like object using "rb" mode --> open(filename, "rb"). |         A file-like object using "rb" mode --> open(filename, "rb"). | ||||||
|     metadata_last_modified |     metadata_last_modified | ||||||
|         The last modified date for the document. |         The last modified date for the document. | ||||||
|     languages |  | ||||||
|         User defined value for `metadata.languages` if provided. Otherwise language is detected |  | ||||||
|         using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be |  | ||||||
|         in either language. |  | ||||||
|         Additional Parameters: |  | ||||||
|             detect_language_per_element |  | ||||||
|                 Detect language per element instead of at the document level. |  | ||||||
|     """ |     """ | ||||||
|     exactly_one(filename=filename, file=file) |     exactly_one(filename=filename, file=file) | ||||||
| 
 | 
 | ||||||
| @ -57,9 +43,9 @@ def partition_rtf( | |||||||
|     return partition_html( |     return partition_html( | ||||||
|         text=html_text, |         text=html_text, | ||||||
|         encoding="unicode", |         encoding="unicode", | ||||||
|         metadata_filename=metadata_filename, |         metadata_filename=metadata_filename or filename, | ||||||
|  |         metadata_file_type=FileType.RTF, | ||||||
|         metadata_last_modified=metadata_last_modified or last_modified, |         metadata_last_modified=metadata_last_modified or last_modified, | ||||||
|         languages=languages, |  | ||||||
|         detect_language_per_element=detect_language_per_element, |  | ||||||
|         detection_origin=DETECTION_ORIGIN, |         detection_origin=DETECTION_ORIGIN, | ||||||
|  |         **kwargs, | ||||||
|     ) |     ) | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Steve Canny
						Steve Canny