mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 10:03:07 +00:00 
			
		
		
		
	 d9aed66b65
			
		
	
	
		d9aed66b65
		
			
		
	
	
	
	
		
			
			* feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests 
for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
		
			
				
	
	
		
			764 lines
		
	
	
		
			24 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			764 lines
		
	
	
		
			24 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import os
 | |
| from tempfile import SpooledTemporaryFile
 | |
| from unittest import mock
 | |
| 
 | |
| import pytest
 | |
| from PIL import Image
 | |
| from unstructured_inference.inference import layout
 | |
| 
 | |
| from unstructured.documents.coordinates import PixelSpace
 | |
| from unstructured.documents.elements import (
 | |
|     CoordinatesMetadata,
 | |
|     ElementMetadata,
 | |
|     NarrativeText,
 | |
|     Text,
 | |
|     Title,
 | |
| )
 | |
| from unstructured.partition import pdf, strategies
 | |
| 
 | |
| 
 | |
class MockResponse:
    """Minimal stand-in for an HTTP response: a status code plus a canned payload."""

    def __init__(self, status_code, response):
        self.response = response
        self.status_code = status_code

    def json(self):
        """Return the payload that was supplied at construction time."""
        payload = self.response
        return payload
 | |
| 
 | |
| 
 | |
def mock_healthy_get(url, **kwargs):
    """Fake GET handler: ignores its arguments and returns a 200 response with an empty body."""
    empty_body = {}
    return MockResponse(response=empty_body, status_code=200)
 | |
| 
 | |
| 
 | |
def mock_unhealthy_get(url, **kwargs):
    """Fake GET handler: ignores its arguments and returns a 500 response with an empty body."""
    empty_body = {}
    return MockResponse(response=empty_body, status_code=500)
 | |
| 
 | |
| 
 | |
def mock_unsuccessful_post(url, **kwargs):
    """Fake POST handler: ignores its arguments and returns a 500 response with an empty body."""
    empty_body = {}
    return MockResponse(response=empty_body, status_code=500)
 | |
| 
 | |
| 
 | |
def mock_successful_post(url, **kwargs):
    """Fake POST handler returning a 200 response whose payload has two pages, one Title each."""
    first_page = {
        "number": 0,
        "elements": [{"type": "Title", "text": "Charlie Brown and the Great Pumpkin"}],
    }
    second_page = {
        "number": 1,
        "elements": [{"type": "Title", "text": "A Charlie Brown Christmas"}],
    }
    payload = {"pages": [first_page, second_page]}
    return MockResponse(status_code=200, response=payload)
 | |
| 
 | |
| 
 | |
class MockPageLayout(layout.PageLayout):
    """PageLayout test double that always reports one fixed Title element.

    The parent initializer is not invoked; only ``number`` and ``image`` are set.
    """

    # ``Image`` is the PIL module in this file (see ``Image.new`` usage), so the
    # image class itself is ``Image.Image``.
    def __init__(self, number: int, image: Image.Image):
        self.number = number
        self.image = image

    @property
    def elements(self):
        """Return a one-item list holding a Title LayoutElement with corners (0, 0) and (2, 2)."""
        return [
            layout.LayoutElement(
                type="Title",
                x1=0,
                y1=0,
                x2=2,
                y2=2,
                text="Charlie Brown and the Great Pumpkin",
            ),
        ]
 | |
| 
 | |
| 
 | |
class MockDocumentLayout(layout.DocumentLayout):
    """DocumentLayout test double exposing exactly one mocked page."""

    @property
    def pages(self):
        """Return a fresh single-page list built around a 1x1 one-bit image."""
        blank_image = Image.new("1", (1, 1))
        only_page = MockPageLayout(number=0, image=blank_image)
        return [only_page]
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    ("filename", "file"),
    [("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")],
)
def test_partition_pdf_local(monkeypatch, filename, file):
    """Local partitioning returns the mocked layout's title for both filename and file input."""

    def fake_process(*args, **kwargs):
        return MockDocumentLayout()

    # Replace both model entry points so no real inference runs.
    for entry_point in ("process_data_with_model", "process_file_with_model"):
        monkeypatch.setattr(layout, entry_point, fake_process)

    first_element = pdf._partition_pdf_or_image_local(filename, file)[0]
    assert first_element.text == "Charlie Brown and the Great Pumpkin"
 | |
| 
 | |
| 
 | |
def test_partition_pdf_local_raises_with_no_filename():
    """An empty filename with no file object must raise FileNotFoundError."""
    empty_inputs = {"filename": "", "file": None, "is_image": False}
    with pytest.raises(FileNotFoundError):
        pdf._partition_pdf_or_image_local(**empty_inputs)
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize("strategy", ["fast", "hi_res", "ocr_only"])
def test_partition_pdf_with_filename(
    strategy,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """partition_pdf accepts a filename for each strategy."""
    elements = pdf.partition_pdf(filename=filename, strategy=strategy)
    # More than ten elements are expected from this document.
    assert len(elements) > 10
    # Elements must come from exactly pages 1 and 2.
    seen_pages = {el.metadata.page_number for el in elements}
    assert seen_pages == {1, 2}
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize("strategy", ["fast", "hi_res", "ocr_only"])
def test_partition_pdf_with_file_rb(
    strategy,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """partition_pdf accepts a file opened in binary mode for each strategy."""
    with open(filename, "rb") as handle:
        elements = pdf.partition_pdf(file=handle, strategy=strategy)
        # More than ten elements are expected from this document.
        assert len(elements) > 10
        # Elements must come from exactly pages 1 and 2.
        seen_pages = {el.metadata.page_number for el in elements}
        assert seen_pages == {1, 2}
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    "strategy",
    ["fast", "hi_res", "ocr_only"],
)
def test_partition_pdf_with_spooled_file(
    strategy,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """partition_pdf accepts a SpooledTemporaryFile for each strategy."""
    # Both files are managed by the ``with`` statement so the spooled copy is
    # closed even when partitioning or an assertion fails.
    with open(filename, "rb") as test_file, SpooledTemporaryFile() as spooled_temp_file:
        spooled_temp_file.write(test_file.read())
        # Rewind so partition_pdf reads from the start of the copy.
        spooled_temp_file.seek(0)
        result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
        # More than ten elements are expected from this document.
        assert len(result) > 10
        # Elements must come from exactly pages 1 and 2.
        assert {element.metadata.page_number for element in result} == {1, 2}
 | |
| 
 | |
| 
 | |
@mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"})
def test_partition_pdf_with_model_name_env_var(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The model name set through the environment variable reaches process_file_with_model."""
    monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
    expected_kwargs = {
        "is_image": False,
        "ocr_languages": "eng",
        "extract_tables": False,
        "model_name": "checkbox",
    }
    with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process:
        pdf.partition_pdf(filename=filename, strategy="hi_res")
        mock_process.assert_called_once_with(filename, **expected_kwargs)
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_model_name(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """An explicit ``model_name`` argument is forwarded to process_file_with_model."""
    monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
    process_stub = mock.MagicMock()
    with mock.patch.object(layout, "process_file_with_model", process_stub) as mock_process:
        pdf.partition_pdf(filename=filename, strategy="hi_res", model_name="checkbox")
        expected_kwargs = {
            "is_image": False,
            "ocr_languages": "eng",
            "extract_tables": False,
            "model_name": "checkbox",
        }
        mock_process.assert_called_once_with(filename, **expected_kwargs)
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_auto_strategy(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The auto strategy yields the paper title first, with filename and directory metadata."""
    heading = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    elements = pdf.partition_pdf(filename=filename, strategy="auto")
    first = elements[0]
    assert first.text == heading
    assert first.metadata.filename == "layout-parser-paper-fast.pdf"
    assert first.metadata.file_directory == "example-docs"
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_page_breaks(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With ``include_page_breaks=True`` at least one PageBreak element is produced."""
    elements = pdf.partition_pdf(filename=filename, url=None, include_page_breaks=True)
    categories = [elem.category for elem in elements]
    assert "PageBreak" in categories
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_no_page_breaks(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """Without ``include_page_breaks`` no PageBreak element is produced."""
    elements = pdf.partition_pdf(filename=filename, url=None)
    categories = [elem.category for elem in elements]
    assert "PageBreak" not in categories
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_fast_strategy(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The fast strategy covers both pages and stamps the base filename on every element."""
    elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast")
    assert len(elements) > 10
    # Elements must come from exactly pages 1 and 2.
    seen_pages = {el.metadata.page_number for el in elements}
    assert seen_pages == {1, 2}
    for el in elements:
        assert el.metadata.filename == "layout-parser-paper-fast.pdf"
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_fast_groups_text(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The fast strategy groups the abstract into one long NarrativeText element."""
    elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast")

    first_narrative_element = next(
        (element for element in elements if isinstance(element, NarrativeText)),
        None,
    )
    # Fail with a clear message instead of an AttributeError on None below.
    assert first_narrative_element is not None, "expected at least one NarrativeText element"
    assert len(first_narrative_element.text) > 1000
    assert first_narrative_element.text.startswith("Abstract. Recent advances")
    assert first_narrative_element.text.endswith("https://layout-parser.github.io.")
    assert first_narrative_element.metadata.filename == "layout-parser-paper-fast.pdf"
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_fast_strategy_from_file(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The fast strategy works on an open binary file and returns more than ten elements."""
    with open(filename, "rb") as handle:
        elements = pdf.partition_pdf(file=handle, url=None, strategy="fast")
    element_count = len(elements)
    assert element_count > 10
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_fast_strategy_and_page_breaks(
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """Fast strategy with page breaks emits PageBreak elements and no missing-dependency log."""
    options = {"url": None, "strategy": "fast", "include_page_breaks": True}
    elements = pdf.partition_pdf(filename=filename, **options)

    assert len(elements) > 10
    categories = [elem.category for elem in elements]
    assert "PageBreak" in categories

    assert "unstructured_inference is not installed" not in caplog.text
    for elem in elements:
        assert elem.metadata.filename == "layout-parser-paper-fast.pdf"
 | |
| 
 | |
| 
 | |
def test_partition_pdf_raises_with_bad_strategy(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """An unknown strategy name must raise ValueError."""
    bogus_strategy = "made_up"
    with pytest.raises(ValueError):
        pdf.partition_pdf(filename=filename, url=None, strategy=bogus_strategy)
 | |
| 
 | |
| 
 | |
def test_partition_pdf_falls_back_to_fast(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With inference and OCR deps reported missing, hi_res calls extractable_elements once."""
    unavailable = ["unstructured_inference", "pytesseract"]

    def fake_dependency_exists(dep):
        return dep not in unavailable

    monkeypatch.setattr(strategies, "dependency_exists", fake_dependency_exists)

    stub_elements = [Text("Hello there!")]
    patcher = mock.patch.object(pdf, "extractable_elements", return_value=stub_elements)
    with patcher as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="hi_res")

    mock_partition.assert_called_once()
    assert "unstructured_inference is not installed" in caplog.text
 | |
| 
 | |
| 
 | |
def test_partition_pdf_falls_back_to_fast_from_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With pytesseract reported missing, ocr_only calls extractable_elements once."""
    unavailable = ["pytesseract"]

    def fake_dependency_exists(dep):
        return dep not in unavailable

    monkeypatch.setattr(strategies, "dependency_exists", fake_dependency_exists)

    stub_elements = [Text("Hello there!")]
    patcher = mock.patch.object(pdf, "extractable_elements", return_value=stub_elements)
    with patcher as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only")

    mock_partition.assert_called_once()
    assert "pytesseract is not installed" in caplog.text
 | |
| 
 | |
| 
 | |
def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With pytesseract missing and no extractable text, ocr_only uses the local partitioner."""
    unavailable = ["pytesseract"]

    def fake_dependency_exists(dep):
        return dep not in unavailable

    monkeypatch.setattr(strategies, "dependency_exists", fake_dependency_exists)
    # Report no extractable text for the document.
    monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])

    stub_elements = [Text("Hello there!")]
    patcher = mock.patch.object(
        pdf,
        "_partition_pdf_or_image_local",
        return_value=stub_elements,
    )
    with patcher as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only")

    mock_partition.assert_called_once()
    assert "pytesseract is not installed" in caplog.text
 | |
| 
 | |
| 
 | |
def test_partition_pdf_falls_back_to_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With unstructured_inference reported missing, hi_res calls the OCR partitioner once."""
    unavailable = ["unstructured_inference"]

    def fake_dependency_exists(dep):
        return dep not in unavailable

    monkeypatch.setattr(strategies, "dependency_exists", fake_dependency_exists)

    stub_elements = [Text("Hello there!")]
    patcher = mock.patch.object(
        pdf,
        "_partition_pdf_or_image_with_ocr",
        return_value=stub_elements,
    )
    with patcher as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="hi_res")

    mock_partition.assert_called_once()
    assert "unstructured_inference is not installed" in caplog.text
 | |
| 
 | |
| 
 | |
def test_partition_pdf_uses_table_extraction():
    """``infer_table_structure=True`` is passed on as a truthy ``extract_tables`` keyword."""
    target = "unstructured_inference.inference.layout.process_file_with_model"
    filename = "example-docs/layout-parser-paper-fast.pdf"
    with mock.patch(target) as mock_process_file_with_model:
        pdf.partition_pdf(filename, infer_table_structure=True)
        # Index 1 of call_args holds the keyword arguments of the last call.
        call_kwargs = mock_process_file_with_model.call_args[1]
        assert call_kwargs["extract_tables"]
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_copy_protection():
    """hi_res partitioning of the copy-protected sample yields elements on pages 1 and 2."""
    filename = os.path.join("example-docs", "copy-protected.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy="hi_res")
    # NOTE(review): a bare `elements[0] == Title(...)` comparison used to sit here;
    # its result was discarded, so it never asserted anything and has been removed.
    # Add a real title assertion once the expected first element is confirmed.
    # Elements must come from exactly pages 1 and 2.
    assert {element.metadata.page_number for element in elements} == {1, 2}
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_dpi():
    """``pdf_image_dpi`` is forwarded to process_file_with_model alongside the other keywords."""
    filename = os.path.join("example-docs", "copy-protected.pdf")
    expected_kwargs = {
        "is_image": False,
        "ocr_languages": "eng",
        "extract_tables": False,
        "model_name": None,
        "pdf_image_dpi": 100,
    }
    with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process:
        pdf.partition_pdf(filename=filename, strategy="hi_res", pdf_image_dpi=100)
        mock_process.assert_called_once_with(filename, **expected_kwargs)
 | |
| 
 | |
| 
 | |
def test_partition_pdf_requiring_recursive_text_grab(filename="example-docs/reliance.pdf"):
    """Fast partitioning of the reliance sample yields 50+ elements from page 1 through page 3."""
    elements = pdf.partition_pdf(filename=filename, strategy="fast")
    assert len(elements) > 50
    first, last = elements[0], elements[-1]
    assert first.metadata.page_number == 1
    assert last.metadata.page_number == 3
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_copy_protection_fallback_to_hi_res(caplog):
    """Fast partitioning of the flat lorem-ipsum sample logs that the text is not extractable."""
    filename = os.path.join("example-docs", "loremipsum-flat.pdf")
    # NOTE(review): a bare `elements[0] == Title(...)` comparison used to follow this
    # call; its result was discarded, so it never asserted anything and has been
    # removed. Add a real content assertion once the expected output is confirmed.
    pdf.partition_pdf(filename=filename, strategy="fast")
    assert "PDF text is not extractable" in caplog.text
 | |
| 
 | |
| 
 | |
def test_partition_pdf_fails_if_pdf_not_processable(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With no inference/OCR deps and no extractable text, partition_pdf raises ValueError."""
    unavailable = ["unstructured_inference", "pytesseract"]

    def fake_dependency_exists(dep):
        return dep not in unavailable

    monkeypatch.setattr(strategies, "dependency_exists", fake_dependency_exists)
    # Report no extractable text for the document.
    monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])

    with pytest.raises(ValueError):
        pdf.partition_pdf(filename=filename)
 | |
| 
 | |
| 
 | |
def test_partition_pdf_fast_groups_text_in_text_box():
    """Fast partitioning of the chevron page yields the expected titles, text and coordinates."""

    def build_metadata(points):
        # Both expected elements use the same 612x792 pixel space.
        return ElementMetadata(
            coordinates=CoordinatesMetadata(
                points=points,
                system=PixelSpace(width=612, height=792),
            ),
        )

    elements = pdf.partition_pdf(
        filename=os.path.join("example-docs", "chevron-page.pdf"),
        strategy="fast",
    )

    heading_points = (
        (193.1741, 71.94000000000005),
        (193.1741, 91.94000000000005),
        (418.6881, 91.94000000000005),
        (418.6881, 71.94000000000005),
    )
    expected_heading = Title("eastern mediterranean", metadata=build_metadata(heading_points))
    assert elements[0] == expected_heading

    assert isinstance(elements[1], NarrativeText)
    paragraph = str(elements[1])
    assert paragraph.startswith("We")
    assert paragraph.endswith("Jordan and Egypt.")

    ordinal_points = (
        (273.9929, 181.16470000000004),
        (273.9929, 226.16470000000004),
        (333.59990000000005, 226.16470000000004),
        (333.59990000000005, 181.16470000000004),
    )
    expected_ordinal = Title("1st", metadata=build_metadata(ordinal_points))
    assert elements[3] == expected_ordinal
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_metadata_filename(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``metadata_filename`` overrides the filename recorded on every element."""
    options = {"url": None, "include_page_breaks": True, "metadata_filename": "test"}
    elements = pdf.partition_pdf(filename=filename, **options)
    for elem in elements:
        assert elem.metadata.filename == "test"
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_fast_strategy_from_file_with_metadata_filename(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``metadata_filename`` is recorded on every element when partitioning from a file object."""
    options = {"url": None, "strategy": "fast", "metadata_filename": "test"}
    with open(filename, "rb") as handle:
        elements = pdf.partition_pdf(file=handle, **options)
    for elem in elements:
        assert elem.metadata.filename == "test"
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_auto_strategy_exclude_metadata(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With ``include_metadata=False`` the auto strategy leaves every element's metadata empty."""
    elements = pdf.partition_pdf(
        filename=filename,
        strategy="auto",
        include_metadata=False,
    )
    title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    assert elements[0].text == title
    # Iterate the elements directly; no index is needed.
    for element in elements:
        assert element.metadata.to_dict() == {}
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_fast_strategy_from_file_exclude_metadata(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With ``include_metadata=False`` the fast strategy on a file leaves metadata empty."""
    with open(filename, "rb") as f:
        elements = pdf.partition_pdf(
            file=f,
            url=None,
            strategy="fast",
            include_metadata=False,
        )
    # Iterate the elements directly; no index is needed.
    for element in elements:
        assert element.metadata.to_dict() == {}
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_auto_strategy_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """Default strategy: the first element's date equals the patched last-modified date."""
    fake_modified = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date",
        return_value=fake_modified,
    )

    first_element = pdf.partition_pdf(filename=filename)[0]

    assert first_element.metadata.date == fake_modified
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_auto_strategy_custom_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """Default strategy: an explicit ``metadata_date`` wins over the patched last-modified date."""
    fake_modified = "2029-07-05T09:24:28"
    custom_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date",
        return_value=fake_modified,
    )

    first_element = pdf.partition_pdf(filename=filename, metadata_date=custom_date)[0]

    assert first_element.metadata.date == custom_date
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_orc_only_strategy_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """ocr_only strategy: the first element's date equals the patched last-modified date.

    NOTE(review): "orc" in the function name looks like a typo for "ocr"; the
    name is left unchanged here.
    """
    fake_modified = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date",
        return_value=fake_modified,
    )

    first_element = pdf.partition_pdf(filename=filename, strategy="ocr_only")[0]

    assert first_element.metadata.date == fake_modified
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_ocr_only_strategy_custom_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """ocr_only strategy: an explicit ``metadata_date`` wins over the patched date."""
    fake_modified = "2029-07-05T09:24:28"
    custom_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date",
        return_value=fake_modified,
    )

    options = {"metadata_date": custom_date, "strategy": "ocr_only"}
    first_element = pdf.partition_pdf(filename=filename, **options)[0]

    assert first_element.metadata.date == custom_date
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_hi_res_strategy_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """hi_res strategy: the first element's date equals the patched last-modified date."""
    fake_modified = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date",
        return_value=fake_modified,
    )

    first_element = pdf.partition_pdf(filename=filename, strategy="hi_res")[0]

    assert first_element.metadata.date == fake_modified
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_hi_res_strategy_custom_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """hi_res strategy: an explicit ``metadata_date`` wins over the patched date."""
    fake_modified = "2029-07-05T09:24:28"
    custom_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date",
        return_value=fake_modified,
    )

    options = {"metadata_date": custom_date, "strategy": "hi_res"}
    first_element = pdf.partition_pdf(filename=filename, **options)[0]

    assert first_element.metadata.date == custom_date
 | |
| 
 | |
| 
 | |
def test_partition_pdf_from_file_with_auto_strategy_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """File input, default strategy: date equals the patched from-file last-modified date."""
    fake_modified = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date_from_file",
        return_value=fake_modified,
    )

    with open(filename, "rb") as handle:
        elements = pdf.partition_pdf(file=handle)

    first_element = elements[0]
    assert first_element.metadata.date == fake_modified
 | |
| 
 | |
| 
 | |
def test_partition_pdf_from_file_with_auto_strategy_custom_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """File input, default strategy: an explicit ``metadata_date`` wins over the patched date."""
    fake_modified = "2029-07-05T09:24:28"
    custom_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date_from_file",
        return_value=fake_modified,
    )

    with open(filename, "rb") as handle:
        elements = pdf.partition_pdf(file=handle, metadata_date=custom_date)

    first_element = elements[0]
    assert first_element.metadata.date == custom_date
 | |
| 
 | |
| 
 | |
def test_partition_pdf_from_file_with_ocr_only_strategy_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """File input, ocr_only: date equals the patched from-file last-modified date."""
    fake_modified = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date_from_file",
        return_value=fake_modified,
    )

    with open(filename, "rb") as handle:
        elements = pdf.partition_pdf(file=handle, strategy="ocr_only")

    first_element = elements[0]
    assert first_element.metadata.date == fake_modified
 | |
| 
 | |
| 
 | |
def test_partition_pdf_from_file_with_ocr_only_strategy_custom_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """File input, ocr_only: an explicit ``metadata_date`` wins over the patched date."""
    fake_modified = "2029-07-05T09:24:28"
    custom_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date_from_file",
        return_value=fake_modified,
    )

    options = {"metadata_date": custom_date, "strategy": "ocr_only"}
    with open(filename, "rb") as handle:
        elements = pdf.partition_pdf(file=handle, **options)

    first_element = elements[0]
    assert first_element.metadata.date == custom_date
 | |
| 
 | |
| 
 | |
def test_partition_pdf_from_file_with_hi_res_strategy_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """File input, hi_res: date equals the patched from-file last-modified date."""
    fake_modified = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date_from_file",
        return_value=fake_modified,
    )

    with open(filename, "rb") as handle:
        elements = pdf.partition_pdf(file=handle, strategy="hi_res")

    first_element = elements[0]
    assert first_element.metadata.date == fake_modified
 | |
| 
 | |
| 
 | |
def test_partition_pdf_from_file_with_hi_res_strategy_custom_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """File input, hi_res: an explicit ``metadata_date`` wins over the patched date."""
    fake_modified = "2029-07-05T09:24:28"
    custom_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date_from_file",
        return_value=fake_modified,
    )

    options = {"metadata_date": custom_date, "strategy": "hi_res"}
    with open(filename, "rb") as handle:
        elements = pdf.partition_pdf(file=handle, **options)

    first_element = elements[0]
    assert first_element.metadata.date == custom_date
 |