2023-04-21 17:35:43 -04:00
|
|
|
import os
|
2023-05-09 21:39:07 -07:00
|
|
|
from tempfile import SpooledTemporaryFile
|
2023-02-27 17:30:54 +01:00
|
|
|
from unittest import mock
|
|
|
|
|
2022-11-30 16:34:24 -05:00
|
|
|
import pytest
|
2023-07-05 11:25:11 -07:00
|
|
|
from PIL import Image
|
2023-02-27 17:30:54 +01:00
|
|
|
from unstructured_inference.inference import layout
|
2022-11-30 16:34:24 -05:00
|
|
|
|
2023-09-11 16:00:14 -05:00
|
|
|
from unstructured.chunking.title import chunk_by_title
|
2023-06-20 11:19:55 -05:00
|
|
|
from unstructured.documents.coordinates import PixelSpace
|
2023-07-05 11:25:11 -07:00
|
|
|
from unstructured.documents.elements import (
|
|
|
|
CoordinatesMetadata,
|
|
|
|
ElementMetadata,
|
feat: pdf auto strategy groups broken numbered and bullet list items(#1393)
**Summary**
Adds logic to combine broken numbered list for pdf fast strategy.
**Details**
Previously the document reads the numbered list items part of the
`layout-parser-paper-fast.pdf` file as:
```
'1. An off-the-shelf toolkit for applying DL models for layout detection, character'
'recognition, and other DIA tasks (Section 3)'
'2. A rich repository of pre-trained neural network models (Model Zoo) that'
'underlies the off-the-shelf usage'
'3. Comprehensive tools for efficient document image data annotation and model'
'tuning to support different levels of customization'
'4. A DL model hub and community platform for the easy sharing, distribu- tion, and discussion of DIA models and pipelines, to promote reusability, reproducibility, and extensibility (Section 4)'
```
Now it reads:
```
'1. An off-the-shelf toolkit for applying DL models for layout detection, character recognition, and other DIA tasks (Section 3)'
'2. A rich repository of pre-trained neural network models (Model Zoo) that underlies the off-the-shelf usage'
'3. Comprehensive tools for efficient document image data annotation and model tuning to support different levels of customization'
'4. A DL model hub and community platform for the easy sharing, distribu- tion, and discussion of DIA models and pipelines, to promote reusability, reproducibility, and extensibility (Section 4)'
```
The added logic leverages `ElementType` and `coordinates` to determine
whether the following line is part of the previously detected
`ListItem` or not.
**Test**
Adds a test that checks that the number of elements is smaller than in the original version with
the broken numbered list. The test also checks whether the first detected
numbered list item ends with the previously broken line.
---------
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Klaijan <Klaijan@users.noreply.github.com>
2023-09-13 17:30:06 -04:00
|
|
|
ListItem,
|
2023-07-05 11:25:11 -07:00
|
|
|
NarrativeText,
|
|
|
|
Text,
|
|
|
|
Title,
|
|
|
|
)
|
2023-05-08 13:21:24 -04:00
|
|
|
from unstructured.partition import pdf, strategies
|
2023-08-29 16:59:26 -04:00
|
|
|
from unstructured.partition.json import partition_json
|
2023-10-05 15:26:47 -05:00
|
|
|
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
2023-08-29 16:59:26 -04:00
|
|
|
from unstructured.staging.base import elements_to_json
|
2022-11-21 17:27:23 -05:00
|
|
|
|
|
|
|
|
2022-11-30 16:34:24 -05:00
|
|
|
class MockResponse:
    """Minimal stand-in for an HTTP response: a status code plus a JSON payload."""

    def __init__(self, status_code, response):
        # Both constructor arguments are stored verbatim.
        self.response = response
        self.status_code = status_code

    def json(self):
        """Return the payload that was passed to the constructor."""
        payload = self.response
        return payload
|
|
|
|
|
|
|
|
|
|
|
|
def mock_healthy_get(url, **kwargs):
    """Return a status-200 mock response with an empty body; the arguments are ignored."""
    empty_body = {}
    return MockResponse(response=empty_body, status_code=200)
|
|
|
|
|
|
|
|
|
|
|
|
def mock_unhealthy_get(url, **kwargs):
    """Return a status-500 mock response with an empty body; the arguments are ignored."""
    empty_body = {}
    return MockResponse(response=empty_body, status_code=500)
|
|
|
|
|
|
|
|
|
|
|
|
def mock_unsuccessful_post(url, **kwargs):
    """Return a status-500 mock response with an empty body; the arguments are ignored."""
    empty_body = {}
    return MockResponse(response=empty_body, status_code=500)
|
|
|
|
|
|
|
|
|
|
|
|
def mock_successful_post(url, **kwargs):
    """Return a status-200 mock response whose payload has two pages, one Title each."""
    first_page = {
        "number": 0,
        "elements": [
            {"type": "Title", "text": "Charlie Brown and the Great Pumpkin"},
        ],
    }
    second_page = {
        "number": 1,
        "elements": [{"type": "Title", "text": "A Charlie Brown Christmas"}],
    }
    payload = {"pages": [first_page, second_page]}
    return MockResponse(status_code=200, response=payload)
|
|
|
|
|
|
|
|
|
2023-01-04 16:19:05 -06:00
|
|
|
class MockPageLayout(layout.PageLayout):
    """Page layout stub that exposes one hard-coded Title element.

    The parent initializer is intentionally not called; only ``number`` and
    ``image`` are stored on the instance.
    """

    def __init__(self, number: int, image: Image.Image):
        # ``Image`` is the PIL module here, so the image class is ``Image.Image``.
        self.number = number
        self.image = image

    @property
    def elements(self):
        """Return a single-item list holding a Title with corners (0, 0) and (2, 2)."""
        return [
            layout.LayoutElement(
                type="Title",
                x1=0,
                y1=0,
                x2=2,
                y2=2,
                text="Charlie Brown and the Great Pumpkin",
            ),
        ]
|
|
|
|
|
|
|
|
|
|
|
|
class MockDocumentLayout(layout.DocumentLayout):
    """Document layout stub whose ``pages`` is a single mock page with a 1x1 image."""

    @property
    def pages(self):
        """Return a one-item list containing page number 0."""
        tiny_image = Image.new("1", (1, 1))
        only_page = MockPageLayout(number=0, image=tiny_image)
        return [only_page]
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("filename", "file"),
    [("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")],
)
def test_partition_pdf_local(monkeypatch, filename, file):
    """Local partitioning returns the mocked layout's title for filename and file input."""

    def _fake_layout(*args, **kwargs):
        return MockDocumentLayout()

    # Replace both model entry points with the stub layout.
    for attr_name in ("process_data_with_model", "process_file_with_model"):
        monkeypatch.setattr(layout, attr_name, _fake_layout)

    partition_pdf_response = pdf._partition_pdf_or_image_local(filename, file)
    first_element = partition_pdf_response[0]
    assert first_element.text == "Charlie Brown and the Great Pumpkin"
|
|
|
|
|
|
|
|
|
2023-01-13 22:24:13 -06:00
|
|
|
def test_partition_pdf_local_raises_with_no_filename():
    """An empty filename with no file object raises FileNotFoundError."""
    call_kwargs = {"filename": "", "file": None, "is_image": False}
    with pytest.raises(FileNotFoundError):
        pdf._partition_pdf_or_image_local(**call_kwargs)
|
2022-11-30 16:34:24 -05:00
|
|
|
|
|
|
|
|
2023-08-30 18:34:55 -05:00
|
|
|
@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize(
    ("strategy", "expected", "origin"),
    # fast: can't capture the "intentionally left blank page" page
    # others: will ignore the actual blank page
    [("fast", {1, 4}, "pdfminer"), ("hi_res", {1, 3, 4}, "pdf"), ("ocr_only", {1, 3, 4}, "OCR")],
)
def test_partition_pdf(
    file_mode,
    strategy,
    expected,
    origin,
    filename="example-docs/layout-parser-paper-with-empty-pages.pdf",
):
    """Partition the fixture by path, by binary handle and by spooled temp file.

    ``expected`` is the exact set of page numbers the elements must carry for
    the given ``strategy``. ``origin`` is the expected ``detection_origin``,
    checked only when ``UNSTRUCTURED_INCLUDE_DEBUG_METADATA`` is truthy.
    """

    def _test(result):
        # the result must contain more than ten elements
        assert len(result) > 10
        # the page numbers seen across all elements must match exactly
        assert {element.metadata.page_number for element in result} == expected
        if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
            assert {element.metadata.detection_origin for element in result} == {origin}

    if file_mode == "filename":
        _test(pdf.partition_pdf(filename=filename, strategy=strategy))
    elif file_mode == "rb":
        with open(filename, "rb") as f:
            _test(pdf.partition_pdf(file=f, strategy=strategy))
    else:
        # Context-manage the spooled file so it is closed even when
        # partitioning or one of the assertions fails.
        with open(filename, "rb") as test_file, SpooledTemporaryFile() as spooled_temp_file:
            spooled_temp_file.write(test_file.read())
            spooled_temp_file.seek(0)
            _test(pdf.partition_pdf(file=spooled_temp_file, strategy=strategy))
|
2023-05-09 21:39:07 -07:00
|
|
|
|
|
|
|
|
2023-06-27 23:06:08 -05:00
|
|
|
@mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"})
def test_partition_pdf_with_model_name_env_var(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """hi_res partitioning passes the model name set in the environment to the layout call."""
    monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
    expected_kwargs = {
        "is_image": False,
        "ocr_languages": "eng",
        "ocr_mode": "entire_page",
        "extract_tables": False,
        "model_name": "checkbox",
    }
    patcher = mock.patch.object(layout, "process_file_with_model", mock.MagicMock())
    with patcher as mock_process:
        pdf.partition_pdf(filename=filename, strategy="hi_res")
        mock_process.assert_called_once_with(filename, **expected_kwargs)
|
2023-02-08 10:11:15 -05:00
|
|
|
|
|
|
|
|
2023-07-07 11:16:55 -04:00
|
|
|
def test_partition_pdf_with_model_name(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """hi_res partitioning passes an explicit ``model_name`` through to the layout call."""
    monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
    expected_kwargs = {
        "is_image": False,
        "ocr_languages": "eng",
        "ocr_mode": "entire_page",
        "extract_tables": False,
        "model_name": "checkbox",
    }
    patcher = mock.patch.object(layout, "process_file_with_model", mock.MagicMock())
    with patcher as mock_process:
        pdf.partition_pdf(filename=filename, strategy="hi_res", model_name="checkbox")
        mock_process.assert_called_once_with(filename, **expected_kwargs)
|
|
|
|
|
|
|
|
|
2023-05-31 13:50:15 -05:00
|
|
|
def test_partition_pdf_with_auto_strategy(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With strategy "auto", the element at index 7 is the title and carries file metadata."""
    title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    elements = pdf.partition_pdf(filename=filename, strategy="auto")
    title_element = elements[7]
    assert title_element.text == title
    assert title_element.metadata.filename == "layout-parser-paper-fast.pdf"
    assert title_element.metadata.file_directory == "example-docs"
|
2023-05-12 13:45:08 -04:00
|
|
|
|
|
|
|
|
2023-05-31 13:50:15 -05:00
|
|
|
def test_partition_pdf_with_page_breaks(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """Requesting page breaks yields at least one element whose category is "PageBreak"."""
    elements = pdf.partition_pdf(filename=filename, url=None, include_page_breaks=True)
    categories = [elem.category for elem in elements]
    assert "PageBreak" in categories
|
2023-02-08 10:11:15 -05:00
|
|
|
|
|
|
|
|
2023-05-31 13:50:15 -05:00
|
|
|
def test_partition_pdf_with_no_page_breaks(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """Without ``include_page_breaks``, no element has the category "PageBreak"."""
    elements = pdf.partition_pdf(filename=filename, url=None)
    categories = [elem.category for elem in elements]
    assert "PageBreak" not in categories
|
2023-03-10 22:16:05 -05:00
|
|
|
|
|
|
|
|
2023-05-31 13:50:15 -05:00
|
|
|
def test_partition_pdf_with_fast_strategy(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The fast strategy yields more than ten elements spread over pages 1 and 2."""
    elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast")
    assert len(elements) > 10
    # the elements must cover exactly pages 1 and 2
    page_numbers = {element.metadata.page_number for element in elements}
    assert page_numbers == {1, 2}
    # every element records the base name of the source file
    for item in elements:
        assert item.metadata.filename == "layout-parser-paper-fast.pdf"
|
2023-03-10 22:16:05 -05:00
|
|
|
|
|
|
|
|
2023-09-19 19:25:31 -07:00
|
|
|
def test_partition_pdf_with_fast_neg_coordinates():
    """The first element of the negative-coords fixture has negative x on its first two points."""
    filename = "example-docs/negative-coords.pdf"
    elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast")
    assert len(elements) == 5
    first_point_x = elements[0].metadata.coordinates.points[0][0]
    assert first_point_x < 0
    second_point_x = elements[0].metadata.coordinates.points[1][0]
    assert second_point_x < 0
|
|
|
|
|
|
|
|
|
2023-05-31 13:50:15 -05:00
|
|
|
def test_partition_pdf_with_fast_groups_text(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The fast strategy groups the abstract into one long NarrativeText element."""
    elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast")

    first_narrative_element = next(
        (element for element in elements if isinstance(element, NarrativeText)),
        None,
    )
    # Fail with a clear message rather than an AttributeError on None below.
    assert first_narrative_element is not None, "no NarrativeText element was produced"
    assert len(first_narrative_element.text) > 1000
    assert first_narrative_element.text.startswith("Abstract. Recent advances")
    assert first_narrative_element.text.endswith("https://layout-parser.github.io.")
    assert first_narrative_element.metadata.filename == "layout-parser-paper-fast.pdf"
|
2023-04-19 13:54:17 -04:00
|
|
|
|
|
|
|
|
2023-03-10 22:16:05 -05:00
|
|
|
def test_partition_pdf_with_fast_strategy_from_file(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The fast strategy yields more than ten elements when given an open binary file."""
    with open(filename, "rb") as pdf_handle:
        elements = pdf.partition_pdf(file=pdf_handle, url=None, strategy="fast")
    element_count = len(elements)
    assert element_count > 10
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_with_fast_strategy_and_page_breaks(
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The fast strategy emits page breaks and does not log a missing-inference warning."""
    partition_kwargs = {
        "filename": filename,
        "url": None,
        "strategy": "fast",
        "include_page_breaks": True,
    }
    elements = pdf.partition_pdf(**partition_kwargs)
    assert len(elements) > 10
    categories = [elem.category for elem in elements]
    assert "PageBreak" in categories

    assert "unstructured_inference is not installed" not in caplog.text
    # every element records the base name of the source file
    for item in elements:
        assert item.metadata.filename == "layout-parser-paper-fast.pdf"
|
2023-04-13 11:46:35 -04:00
|
|
|
|
2023-03-10 22:16:05 -05:00
|
|
|
|
|
|
|
def test_partition_pdf_raises_with_bad_strategy(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """An unknown strategy name makes ``partition_pdf`` raise ValueError."""
    bad_kwargs = {"filename": filename, "url": None, "strategy": "made_up"}
    with pytest.raises(ValueError):
        pdf.partition_pdf(**bad_kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_falls_back_to_fast(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """hi_res uses ``extractable_elements`` when inference and pytesseract are reported missing."""
    unavailable = ["unstructured_inference", "pytesseract"]

    def mock_exists(dep):
        # Report every dependency as present except the ones listed above.
        return dep not in unavailable

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)

    mock_return = [Text("Hello there!")]
    patcher = mock.patch.object(pdf, "extractable_elements", return_value=mock_return)
    with patcher as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="hi_res")

    mock_partition.assert_called_once()
    assert "unstructured_inference is not installed" in caplog.text
|
2023-05-08 13:21:24 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_falls_back_to_fast_from_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """ocr_only uses ``extractable_elements`` and skips OCR when pytesseract is reported missing."""
    unavailable = ["pytesseract"]

    def mock_exists(dep):
        # Report every dependency as present except pytesseract.
        return dep not in unavailable

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)

    mock_return = [Text("Hello there!")]
    fast_patcher = mock.patch.object(pdf, "extractable_elements", return_value=mock_return)
    ocr_patcher = mock.patch.object(pdf, "_partition_pdf_or_image_with_ocr")
    with fast_patcher as mock_partition, ocr_patcher as mock_partition_ocr:
        pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only")

    mock_partition.assert_called_once()
    mock_partition_ocr.assert_not_called()
    assert "pytesseract is not installed" in caplog.text
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """ocr_only uses the local partitioner when pytesseract is missing and no text is extractable."""
    unavailable = ["pytesseract"]

    def mock_exists(dep):
        # Report every dependency as present except pytesseract.
        return dep not in unavailable

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
    # Make the PDF look as if it had no extractable text.
    monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])

    mock_return = [Text("Hello there!")]
    patcher = mock.patch.object(pdf, "_partition_pdf_or_image_local", return_value=mock_return)
    with patcher as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only")

    mock_partition.assert_called_once()
    assert "pytesseract is not installed" in caplog.text
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_falls_back_to_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """hi_res uses the OCR partitioner when unstructured_inference is reported missing."""
    unavailable = ["unstructured_inference"]

    def mock_exists(dep):
        # Report every dependency as present except unstructured_inference.
        return dep not in unavailable

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)

    mock_return = [Text("Hello there!")]
    patcher = mock.patch.object(
        pdf,
        "_partition_pdf_or_image_with_ocr",
        return_value=mock_return,
    )
    with patcher as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="hi_res")

    mock_partition.assert_called_once()
    assert "unstructured_inference is not installed" in caplog.text
|
2023-04-21 12:01:29 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_uses_table_extraction():
    """``infer_table_structure=True`` reaches the layout call as a truthy ``extract_tables``."""
    filename = "example-docs/layout-parser-paper-fast.pdf"
    target = "unstructured_inference.inference.layout.process_file_with_model"
    with mock.patch(target) as mock_process_file_with_model:
        pdf.partition_pdf(filename, infer_table_structure=True)
        # call_args[1] holds the keyword arguments of the most recent call
        call_kwargs = mock_process_file_with_model.call_args[1]
        assert call_kwargs["extract_tables"]
|
2023-04-21 17:35:43 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_with_copy_protection():
    """hi_res partitions the copy-protected fixture and sets a float detection probability."""
    title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    idx = 3
    filename = os.path.join("example-docs", "copy-protected.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy="hi_res")
    assert elements[idx].text == title
    page_numbers = {element.metadata.page_number for element in elements}
    assert page_numbers == {1, 2}
    assert elements[idx].metadata.detection_class_prob is not None
    assert isinstance(elements[idx].metadata.detection_class_prob, float)
|
2023-07-26 09:26:06 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_with_dpi():
    """``pdf_image_dpi`` is passed through unchanged to the layout call for hi_res."""
    filename = os.path.join("example-docs", "copy-protected.pdf")
    patcher = mock.patch.object(layout, "process_file_with_model", mock.MagicMock())
    with patcher as mock_process:
        pdf.partition_pdf(filename=filename, strategy="hi_res", pdf_image_dpi=100)
        # The default model name is looked up only after partitioning has run.
        mock_process.assert_called_once_with(
            filename,
            is_image=False,
            ocr_languages="eng",
            ocr_mode="entire_page",
            extract_tables=False,
            model_name=pdf.default_hi_res_model(),
            pdf_image_dpi=100,
        )
|
2023-04-21 17:35:43 -04:00
|
|
|
|
|
|
|
|
2023-06-22 11:19:54 -04:00
|
|
|
def test_partition_pdf_requiring_recursive_text_grab(filename="example-docs/reliance.pdf"):
    """The fast strategy yields over fifty elements, starting on page 1 and ending on page 3."""
    elements = pdf.partition_pdf(filename=filename, strategy="fast")
    assert len(elements) > 50
    first_element, last_element = elements[0], elements[-1]
    assert first_element.metadata.page_number == 1
    assert last_element.metadata.page_number == 3
|
|
|
|
|
|
|
|
|
2023-04-21 17:35:43 -04:00
|
|
|
def test_partition_pdf_with_copy_protection_fallback_to_hi_res(caplog):
    """Requesting "fast" on a PDF without extractable text logs the non-extractable warning."""
    filename = os.path.join("example-docs", "loremipsum-flat.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy="fast")
    # The original compared elements[0] against a Title and discarded the
    # result, so it only verified that at least one element exists. Make that
    # check explicit.
    # NOTE(review): the discarded comparison expected the title
    # "LayoutParser: A Unified Toolkit for Deep Based Document Image Analysis";
    # confirm the right text for this fixture before asserting on content.
    assert elements
    assert "PDF text is not extractable" in caplog.text
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_fails_if_pdf_not_processable(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With no extractable text and no inference or OCR dependency, partitioning raises."""
    unavailable = ["unstructured_inference", "pytesseract"]

    def mock_exists(dep):
        # Report every dependency as present except the ones listed above.
        return dep not in unavailable

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
    # Make the PDF look as if it had no extractable text.
    monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])

    with pytest.raises(ValueError):
        pdf.partition_pdf(filename=filename)
|
2023-05-03 18:33:24 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_fast_groups_text_in_text_box():
    """The fast strategy groups text-box content and reports the expected coordinates."""

    def _expected_metadata(points):
        # Both expected elements use the same 612x792 pixel space.
        return ElementMetadata(
            coordinates=CoordinatesMetadata(
                points=points,
                system=PixelSpace(width=612, height=792),
            ),
        )

    filename = os.path.join("example-docs", "chevron-page.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy="fast")

    title_points = (
        (193.1741, 71.94000000000005),
        (193.1741, 91.94000000000005),
        (418.6881, 91.94000000000005),
        (418.6881, 71.94000000000005),
    )
    expected_title = Title(
        "eastern mediterranean",
        metadata=_expected_metadata(title_points),
    )
    assert elements[0] == expected_title

    narrative = elements[1]
    assert isinstance(narrative, NarrativeText)
    assert str(narrative).startswith("We")
    assert str(narrative).endswith("Jordan and Egypt.")

    number_points = (
        (95.6683, 181.16470000000004),
        (95.6683, 226.16470000000004),
        (166.7908, 226.16470000000004),
        (166.7908, 181.16470000000004),
    )
    expected_number = Text("2.5", metadata=_expected_metadata(number_points))
    assert elements[2] == expected_number
|
2023-06-30 09:44:46 -05:00
|
|
|
|
|
|
|
|
2023-07-05 15:02:22 -05:00
|
|
|
def test_partition_pdf_with_metadata_filename(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``metadata_filename`` overrides the filename recorded on every element."""
    partition_kwargs = {
        "filename": filename,
        "url": None,
        "include_page_breaks": True,
        "metadata_filename": "test",
    }
    elements = pdf.partition_pdf(**partition_kwargs)
    for item in elements:
        assert item.metadata.filename == "test"
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_with_fast_strategy_from_file_with_metadata_filename(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``metadata_filename`` is recorded on every element when partitioning an open file."""
    with open(filename, "rb") as pdf_handle:
        elements = pdf.partition_pdf(
            file=pdf_handle,
            url=None,
            strategy="fast",
            metadata_filename="test",
        )
    for item in elements:
        assert item.metadata.filename == "test"
|
|
|
|
|
|
|
|
|
2023-06-30 09:44:46 -05:00
|
|
|
def test_partition_pdf_with_auto_strategy_exclude_metadata(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With ``include_metadata=False`` every element's metadata serializes to an empty dict."""
    title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    partition_kwargs = {
        "filename": filename,
        "strategy": "auto",
        "include_metadata": False,
    }
    elements = pdf.partition_pdf(**partition_kwargs)
    assert elements[7].text == title
    for item in elements:
        assert item.metadata.to_dict() == {}
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_with_fast_strategy_from_file_exclude_metadata(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """Partitioning an open file with ``include_metadata=False`` leaves all metadata empty."""
    with open(filename, "rb") as pdf_handle:
        elements = pdf.partition_pdf(
            file=pdf_handle,
            url=None,
            strategy="fast",
            include_metadata=False,
        )
    for item in elements:
        assert item.metadata.to_dict() == {}
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_with_auto_strategy_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """With the default strategy, ``last_modified`` is the patched file modification date."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    patch_target = "unstructured.partition.pdf.get_last_modified_date"
    mocker.patch(patch_target, return_value=mocked_last_modification_date)

    elements = pdf.partition_pdf(filename=filename)

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == mocked_last_modification_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_with_auto_strategy_custom_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """An explicit ``metadata_last_modified`` wins over the patched file modification date."""
    expected_last_modification_date = "2020-07-05T09:24:28"
    mocked_last_modification_date = "2029-07-05T09:24:28"
    patch_target = "unstructured.partition.pdf.get_last_modified_date"
    mocker.patch(patch_target, return_value=mocked_last_modification_date)

    elements = pdf.partition_pdf(
        filename=filename,
        metadata_last_modified=expected_last_modification_date,
    )

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == expected_last_modification_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
# NOTE(review): "orc" in the name looks like a typo for "ocr"; the name is kept
# unchanged so the test ID stays stable.
def test_partition_pdf_with_orc_only_strategy_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """With strategy "ocr_only", ``last_modified`` is the patched file modification date."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    patch_target = "unstructured.partition.pdf.get_last_modified_date"
    mocker.patch(patch_target, return_value=mocked_last_modification_date)

    elements = pdf.partition_pdf(filename=filename, strategy="ocr_only")

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == mocked_last_modification_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_with_ocr_only_strategy_custom_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """With "ocr_only", an explicit ``metadata_last_modified`` wins over the patched date."""
    expected_last_modification_date = "2020-07-05T09:24:28"
    mocked_last_modification_date = "2029-07-05T09:24:28"
    patch_target = "unstructured.partition.pdf.get_last_modified_date"
    mocker.patch(patch_target, return_value=mocked_last_modification_date)

    elements = pdf.partition_pdf(
        filename=filename,
        metadata_last_modified=expected_last_modification_date,
        strategy="ocr_only",
    )

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == expected_last_modification_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_with_hi_res_strategy_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """With strategy "hi_res", ``last_modified`` is the patched file modification date."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    patch_target = "unstructured.partition.pdf.get_last_modified_date"
    mocker.patch(patch_target, return_value=mocked_last_modification_date)

    elements = pdf.partition_pdf(filename=filename, strategy="hi_res")

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == mocked_last_modification_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_with_hi_res_strategy_custom_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """With "hi_res", an explicit ``metadata_last_modified`` wins over the patched date."""
    expected_last_modification_date = "2020-07-05T09:24:28"
    mocked_last_modification_date = "2029-07-05T09:24:28"
    patch_target = "unstructured.partition.pdf.get_last_modified_date"
    mocker.patch(patch_target, return_value=mocked_last_modification_date)

    elements = pdf.partition_pdf(
        filename=filename,
        metadata_last_modified=expected_last_modification_date,
        strategy="hi_res",
    )

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == expected_last_modification_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_from_file_with_auto_strategy_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """For an open file, ``last_modified`` is the patched from-file modification date."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    patch_target = "unstructured.partition.pdf.get_last_modified_date_from_file"
    mocker.patch(patch_target, return_value=mocked_last_modification_date)

    with open(filename, "rb") as pdf_handle:
        elements = pdf.partition_pdf(file=pdf_handle)

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == mocked_last_modification_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_from_file_with_auto_strategy_custom_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """File input, no explicit strategy: a passed `metadata_last_modified` is what is reported."""
    stubbed_date = "2029-07-05T09:24:28"
    custom_date = "2020-07-05T09:24:28"

    # The stub returns a different date, so a leak of its value would fail the assertion.
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date_from_file",
        return_value=stubbed_date,
    )

    with open(filename, "rb") as pdf_file:
        elements = pdf.partition_pdf(file=pdf_file, metadata_last_modified=custom_date)

    first_element = elements[0]
    assert first_element.metadata.last_modified == custom_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_from_file_with_ocr_only_strategy_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """File input with `ocr_only`: `last_modified` is the patched lookup's value."""
    stubbed_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date_from_file",
        return_value=stubbed_date,
    )

    with open(filename, "rb") as pdf_file:
        partition_kwargs = {"file": pdf_file, "strategy": "ocr_only"}
        elements = pdf.partition_pdf(**partition_kwargs)

    first_element = elements[0]
    assert first_element.metadata.last_modified == stubbed_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_from_file_with_ocr_only_strategy_custom_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """File input with `ocr_only`: a passed `metadata_last_modified` is what is reported."""
    stubbed_date = "2029-07-05T09:24:28"
    custom_date = "2020-07-05T09:24:28"

    # The stub returns a different date, so a leak of its value would fail the assertion.
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date_from_file",
        return_value=stubbed_date,
    )

    with open(filename, "rb") as pdf_file:
        partition_kwargs = {
            "file": pdf_file,
            "metadata_last_modified": custom_date,
            "strategy": "ocr_only",
        }
        elements = pdf.partition_pdf(**partition_kwargs)

    first_element = elements[0]
    assert first_element.metadata.last_modified == custom_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_from_file_with_hi_res_strategy_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """File input with `hi_res`: `last_modified` is the patched lookup's value."""
    stubbed_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date_from_file",
        return_value=stubbed_date,
    )

    with open(filename, "rb") as pdf_file:
        partition_kwargs = {"file": pdf_file, "strategy": "hi_res"}
        elements = pdf.partition_pdf(**partition_kwargs)

    first_element = elements[0]
    assert first_element.metadata.last_modified == stubbed_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_from_file_with_hi_res_strategy_custom_metadata_date(
    mocker,
    filename="example-docs/copy-protected.pdf",
):
    """File input with `hi_res`: a passed `metadata_last_modified` is what is reported."""
    stubbed_date = "2029-07-05T09:24:28"
    custom_date = "2020-07-05T09:24:28"

    # The stub returns a different date, so a leak of its value would fail the assertion.
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date_from_file",
        return_value=stubbed_date,
    )

    with open(filename, "rb") as pdf_file:
        partition_kwargs = {
            "file": pdf_file,
            "metadata_last_modified": custom_date,
            "strategy": "hi_res",
        }
        elements = pdf.partition_pdf(**partition_kwargs)

    first_element = elements[0]
    assert first_element.metadata.last_modified == custom_date
|
2023-08-25 00:32:12 -05:00
|
|
|
|
|
|
|
|
2023-08-29 16:59:26 -04:00
|
|
|
@pytest.mark.parametrize("strategy", ["fast", "hi_res"])
def test_partition_pdf_with_json(
    strategy,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """Elements survive a round trip through `elements_to_json` and `partition_json`."""
    elements = pdf.partition_pdf(filename=filename, strategy=strategy)
    serialized = elements_to_json(elements)
    test_elements = partition_json(text=serialized)

    assert len(elements) == len(test_elements)

    # Lengths are equal at this point, so zip pairs up every element without truncation.
    for original, restored in zip(elements, test_elements):
        assert original == restored
|
|
|
|
|
|
|
|
|
2023-08-25 00:32:12 -05:00
|
|
|
def test_partition_pdf_with_ocr_has_coordinates_from_filename(
    filename="example-docs/chevron-page.pdf",
):
    """`ocr_only` by filename: the first element carries the expected corner points as a tuple."""
    expected_points = (
        (657.0, 2144.0),
        (657.0, 2106.0),
        (1043.0, 2106.0),
        (1043.0, 2144.0),
    )

    first_element = pdf.partition_pdf(filename=filename, strategy="ocr_only")[0]

    assert first_element.metadata.coordinates.points == expected_points
|
2023-08-25 00:32:12 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_with_ocr_has_coordinates_from_file(
    filename="example-docs/chevron-page.pdf",
):
    """`ocr_only` from a file object: the first element carries the expected corner points."""
    expected_points = (
        (657.0, 2144.0),
        (657.0, 2106.0),
        (1043.0, 2106.0),
        (1043.0, 2144.0),
    )

    with open(filename, "rb") as pdf_file:
        elements = pdf.partition_pdf(file=pdf_file, strategy="ocr_only")

    first_element = elements[0]
    assert first_element.metadata.coordinates.points == expected_points
|
2023-09-11 16:00:14 -05:00
|
|
|
|
|
|
|
|
2023-09-15 15:11:16 -05:00
|
|
|
@pytest.mark.parametrize(
    "filename",
    [
        "example-docs/multi-column-2p.pdf",
        "example-docs/layout-parser-paper-fast.pdf",
        "example-docs/list-item-example.pdf",
    ],
)
def test_partition_pdf_with_ocr_coordinates_are_not_nan_from_file(
    filename,
):
    """No coordinate produced by the `ocr_only` strategy on a file object is NaN.

    Elements without coordinates are skipped, as are individual coordinate values
    that are `None`.
    """
    import math

    with open(filename, "rb") as f:
        elements = pdf.partition_pdf(
            file=f,
            strategy="ocr_only",
        )

    for element in elements:
        coordinates = element.metadata.coordinates
        if not coordinates:
            continue
        for point in coordinates.points:
            for value in (point[0], point[1]):
                if value is None:
                    continue
                # NaN has to be detected with math.isnan(): an identity comparison against
                # math.nan is true for practically every computed NaN, so it never fails.
                assert not math.isnan(value), f"NaN coordinate in point {point!r}"
|
|
|
|
|
|
|
|
|
2023-10-03 09:40:34 -07:00
|
|
|
def test_add_chunking_strategy_by_title_on_partition_pdf(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """`chunking_strategy="by_title"` matches calling `chunk_by_title` on the plain output."""
    unchunked = pdf.partition_pdf(filename=filename)
    chunked_by_partition = pdf.partition_pdf(filename, chunking_strategy="by_title")
    chunked_manually = chunk_by_title(unchunked)

    # Chunking must change the output, and both routes must agree.
    assert chunked_by_partition != unchunked
    assert chunked_by_partition == chunked_manually
|
2023-09-12 12:15:26 -04:00
|
|
|
|
|
|
|
|
2023-09-18 11:42:02 -04:00
|
|
|
def test_partition_pdf_formats_languages_for_tesseract():
    """`languages=["en"]` reaches `process_file_with_model` as `ocr_languages="eng"`."""
    pdf_path = "example-docs/DA-1p.pdf"
    patched_model_call = mock.patch.object(layout, "process_file_with_model", mock.MagicMock())

    with patched_model_call as mock_process:
        pdf.partition_pdf(filename=pdf_path, strategy="hi_res", languages=["en"])

        expected_kwargs = {
            "is_image": False,
            "ocr_languages": "eng",
            "ocr_mode": "entire_page",
            "extract_tables": False,
            "model_name": pdf.default_hi_res_model(),
        }
        mock_process.assert_called_once_with(pdf_path, **expected_kwargs)
|
|
|
|
|
|
|
|
|
2023-09-12 12:15:26 -04:00
|
|
|
def test_partition_pdf_warns_with_ocr_languages(caplog):
    """Passing `ocr_languages` to `partition_pdf` leaves the deprecation notice in the log."""
    pdf.partition_pdf(
        filename="example-docs/chevron-page.pdf",
        strategy="hi_res",
        ocr_languages="eng",
    )

    expected_fragment = "The ocr_languages kwarg will be deprecated"
    assert expected_fragment in caplog.text
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_or_image_warns_with_ocr_languages(caplog):
    """Passing `ocr_languages` to `partition_pdf_or_image` leaves the deprecation notice."""
    pdf.partition_pdf_or_image(
        filename="example-docs/DA-1p.pdf",
        strategy="hi_res",
        ocr_languages="eng",
    )

    expected_fragment = "The ocr_languages kwarg will be deprecated"
    assert expected_fragment in caplog.text
|
2023-09-12 15:32:48 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_categorization_backup():
    """A `Text` element from the patched local partitioner comes back as a `Title`."""
    title_like_text = "This is Clearly a Title"
    stubbed_partitioner = mock.patch.object(
        pdf,
        "_partition_pdf_or_image_local",
        return_value=[Text(title_like_text)],
    )

    with stubbed_partitioner:
        elements = pdf.partition_pdf_or_image(
            "example-docs/layout-parser-paper-fast.pdf",
            strategy="hi_res",
        )
        first_element = elements[0]
        # The element class is expected to have changed from Text to Title.
        assert isinstance(first_element, Title)
        assert first_element.text == title_like_text
|
feat: pdf auto strategy groups broken numbered and bullet list items(#1393)
**Summary**
Adds logic to combine broken numbered list for pdf fast strategy.
**Details**
Previously the document reads the numbered list items part of the
`layout-parser-paper-fast.pdf` file as:
```
'1. An off-the-shelf toolkit for applying DL models for layout detection, character'
'recognition, and other DIA tasks (Section 3)'
'2. A rich repository of pre-trained neural network models (Model Zoo) that'
'underlies the off-the-shelf usage'
'3. Comprehensive tools for efficient document image data annotation and model'
'tuning to support different levels of customization'
'4. A DL model hub and community platform for the easy sharing, distribu- tion, and discussion of DIA models and pipelines, to promote reusability, reproducibility, and extensibility (Section 4)'
```
Now it reads:
```
'1. An off-the-shelf toolkit for applying DL models for layout detection, character recognition, and other DIA tasks (Section 3)'
'2. A rich repository of pre-trained neural network models (Model Zoo) that underlies the off-the-shelf usage'
'3. Comprehensive tools for efficient document image data annotation and model tuning to support different levels of customization'
'4. A DL model hub and community platform for the easy sharing, distribu- tion, and discussion of DIA models and pipelines, to promote reusability, reproducibility, and extensibility (Section 4)'
```
The added logic leverages `ElementType` and `coordinates` to determine
whether the following line is part of the previously detected
`ListItem` or not.
**Test**
Adds a test that checks that the element count is lower than in the original
version with the broken numbered list. The test also checks that the first
detected numbered list item ends with the previously broken line.
---------
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Klaijan <Klaijan@users.noreply.github.com>
2023-09-13 17:30:06 -04:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "filename",
    ["example-docs/layout-parser-paper-fast.pdf"],
)
def test_combine_numbered_list(filename):
    """The `auto` strategy yields fewer than 28 elements and a rejoined first list item.

    The first `ListItem` must end with the text that the commit history describes as
    previously split onto a separate line.
    """
    elements = pdf.partition_pdf(filename=filename, strategy="auto")
    first_list_element = next(
        (element for element in elements if isinstance(element, ListItem)),
        None,
    )

    assert len(elements) < 28
    # Fail with a readable message instead of an AttributeError on None when nothing matched.
    assert first_list_element is not None, "no ListItem element was produced"
    assert first_list_element.text.endswith(
        "character recognition, and other DIA tasks (Section 3)",
    )
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("filename", ["example-docs/layout-parser-paper-fast.pdf"])
def test_partition_pdf_hyperlinks(filename):
    """The last element from the `auto` strategy carries the three expected citation links."""
    elements = pdf.partition_pdf(filename=filename, strategy="auto")

    # (text, url, start_index) for each link expected on the last element, in order.
    expected_link_fields = (
        ("8", "cite.gardner2018allennlp", 138),
        ("34", "cite.wolf2019huggingface", 141),
        ("35", "cite.wu2019detectron2", 168),
    )
    expected_links = [
        {"text": text, "url": url, "start_index": start_index}
        for text, url, start_index in expected_link_fields
    ]

    last_element = elements[-1]
    assert last_element.metadata.links == expected_links
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("filename", ["example-docs/embedded-link.pdf"])
def test_partition_pdf_hyperlinks_multiple_lines(filename):
    """The last element has two links, and the final link's text is "capturing"."""
    elements = pdf.partition_pdf(filename=filename, strategy="auto")
    last_element = elements[-1]

    final_link = last_element.metadata.links[-1]
    assert final_link["text"] == "capturing"
    assert len(last_element.metadata.links) == 2
|
2023-09-15 15:09:58 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_uses_model_name():
    """`partition_pdf` forwards a truthy `model_name` keyword to the local partitioner."""
    stubbed_partitioner = mock.patch.object(pdf, "_partition_pdf_or_image_local")

    with stubbed_partitioner as mockpartition:
        pdf.partition_pdf(
            "example-docs/layout-parser-paper-fast.pdf",
            model_name="test",
            strategy="hi_res",
        )

        mockpartition.assert_called_once()
        forwarded_kwargs = mockpartition.call_args.kwargs
        assert "model_name" in forwarded_kwargs
        # Only truthiness is checked here, not the exact value that was passed in.
        assert forwarded_kwargs["model_name"]
|
2023-10-03 11:25:20 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_word_bbox_not_char(
    filename="example-docs/interface-config-guide-p93.pdf",
):
    """Partitioning the sample page succeeds and yields 17 elements.

    Raises:
        AssertionError: if partitioning raises, chained to the original exception.
    """
    try:
        elements = pdf.partition_pdf(filename=filename)
    except Exception as e:
        # Raising a plain string is itself a TypeError and would hide the real failure,
        # so raise a proper exception and keep the original one as its cause.
        raise AssertionError("Partitioning fail: %s" % e) from e
    assert len(elements) == 17
|