chore: Update PDFDocument to use from_file method (#35)

* update PDFDocument to use from_file method

* bump version
This commit is contained in:
Matt Robinson 2022-10-13 12:04:30 -04:00 committed by GitHub
parent 2d5dba0ddc
commit 704d6e11d1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 12 additions and 14 deletions

View File

@@ -1,5 +1,6 @@
## 0.2.1-dev8 ## 0.2.1-dev9
* Update `PDFDocument` to use the `from_file` method
* Added staging brick for CSV format for ISD (Initial Structured Data) format. * Added staging brick for CSV format for ISD (Initial Structured Data) format.
* Added staging brick for separating text into attention window size chunks for `transformers`. * Added staging brick for separating text into attention window size chunks for `transformers`.
* Added staging brick for LabelBox. * Added staging brick for LabelBox.

View File

@@ -100,15 +100,13 @@ def test_read_pdf(monkeypatch, mock_page_layout):
images = [image, image] images = [image, image]
layouts = Layout([mock_page_layout, mock_page_layout]) layouts = Layout([mock_page_layout, mock_page_layout])
page = PDFPage(number=0, image=image, layout=mock_page_layout)
monkeypatch.setattr(detectron2, "model", MockLayoutModel(mock_page_layout)) monkeypatch.setattr(detectron2, "model", MockLayoutModel(mock_page_layout))
monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True) monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
doc = PDFDocument("fake-file.pdf")
with patch.object(lp, "load_pdf", return_value=(layouts, images)): with patch.object(lp, "load_pdf", return_value=(layouts, images)):
page.get_elements() doc = PDFDocument.from_file("fake-file.pdf")
assert str(doc).startswith("A Catchy Title") assert str(doc).startswith("A Catchy Title")
assert str(doc).count("A Catchy Title") == 2 # Once for each page assert str(doc).count("A Catchy Title") == 2 # Once for each page
assert str(doc).endswith("A very repetitive narrative. ") assert str(doc).endswith("A very repetitive narrative. ")

View File

@@ -1 +1 @@
__version__ = "0.2.1-dev8" # pragma: no cover __version__ = "0.2.1-dev9" # pragma: no cover

View File

@@ -1,5 +1,5 @@
from __future__ import annotations from __future__ import annotations
from abc import ABC, abstractmethod from abc import ABC
from typing import List, Optional from typing import List, Optional
from unstructured.documents.elements import Element, NarrativeText from unstructured.documents.elements import Element, NarrativeText
@@ -15,7 +15,6 @@ class Document(ABC):
def __str__(self) -> str: def __str__(self) -> str:
return "\n\n".join([str(page) for page in self.pages]) return "\n\n".join([str(page) for page in self.pages])
@abstractmethod
def _read(self) -> List[Page]: # pragma: no cover def _read(self) -> List[Page]: # pragma: no cover
pass pass

View File

@@ -19,7 +19,7 @@ class PDFDocument(Document):
document image analysis (DIA) model detects the layout of the page prior to extracting document image analysis (DIA) model detects the layout of the page prior to extracting
element.""" element."""
def __init__(self, filename): def __init__(self):
print( print(
""" """
@@ -29,12 +29,12 @@ WARNING: PDF parsing capabilities in unstructured is still experimental
""" """
) )
self.filename = filename
super().__init__() super().__init__()
def _read(self) -> List[Page]: @classmethod
logger.info(f"Reading PDF for file: {self.filename} ...") def from_file(cls, filename: str):
layouts, images = lp.load_pdf(self.filename, load_images=True) logger.info(f"Reading PDF for file: {filename} ...")
layouts, images = lp.load_pdf(filename, load_images=True)
pages: List[Page] = list() pages: List[Page] = list()
for i, layout in enumerate(layouts): for i, layout in enumerate(layouts):
image = images[i] image = images[i]
@@ -43,7 +43,7 @@ WARNING: PDF parsing capabilities in unstructured is still experimental
page = PDFPage(number=i, image=image, layout=layout) page = PDFPage(number=i, image=image, layout=layout)
page.get_elements() page.get_elements()
pages.append(page) pages.append(page)
return pages return cls.from_pages(pages)
class PDFPage(Page): class PDFPage(Page):