chore: Update PDFDocument to use from_file method (#35)

* update PDFDocument to use from_file method

* bump version
This commit is contained in:
Matt Robinson 2022-10-13 12:04:30 -04:00 committed by GitHub
parent 2d5dba0ddc
commit 704d6e11d1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 12 additions and 14 deletions

View File

@@ -1,5 +1,6 @@
## 0.2.1-dev8
## 0.2.1-dev9
* Update `PDFDocument` to use the `from_file` method
* Added staging brick for CSV format for ISD (Initial Structured Data) format.
* Added staging brick for separating text into attention window size chunks for `transformers`.
* Added staging brick for LabelBox.

View File

@@ -100,15 +100,13 @@ def test_read_pdf(monkeypatch, mock_page_layout):
images = [image, image]
layouts = Layout([mock_page_layout, mock_page_layout])
page = PDFPage(number=0, image=image, layout=mock_page_layout)
monkeypatch.setattr(detectron2, "model", MockLayoutModel(mock_page_layout))
monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
doc = PDFDocument("fake-file.pdf")
with patch.object(lp, "load_pdf", return_value=(layouts, images)):
page.get_elements()
doc = PDFDocument.from_file("fake-file.pdf")
assert str(doc).startswith("A Catchy Title")
assert str(doc).count("A Catchy Title") == 2 # Once for each page
assert str(doc).endswith("A very repetitive narrative. ")

View File

@@ -1 +1 @@
__version__ = "0.2.1-dev8" # pragma: no cover
__version__ = "0.2.1-dev9" # pragma: no cover

View File

@@ -1,5 +1,5 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from abc import ABC
from typing import List, Optional
from unstructured.documents.elements import Element, NarrativeText
@@ -15,7 +15,6 @@ class Document(ABC):
def __str__(self) -> str:
return "\n\n".join([str(page) for page in self.pages])
@abstractmethod
def _read(self) -> List[Page]: # pragma: no cover
pass

View File

@@ -19,7 +19,7 @@ class PDFDocument(Document):
document image analysis (DIA) model detects the layout of the page prior to extracting
element."""
def __init__(self, filename):
def __init__(self):
print(
"""
@@ -29,12 +29,12 @@ WARNING: PDF parsing capabilities in unstructured is still experimental
"""
)
self.filename = filename
super().__init__()
def _read(self) -> List[Page]:
logger.info(f"Reading PDF for file: {self.filename} ...")
layouts, images = lp.load_pdf(self.filename, load_images=True)
@classmethod
def from_file(cls, filename: str):
logger.info(f"Reading PDF for file: {filename} ...")
layouts, images = lp.load_pdf(filename, load_images=True)
pages: List[Page] = list()
for i, layout in enumerate(layouts):
image = images[i]
@@ -43,7 +43,7 @@ WARNING: PDF parsing capabilities in unstructured is still experimental
page = PDFPage(number=i, image=image, layout=layout)
page.get_elements()
pages.append(page)
return pages
return cls.from_pages(pages)
class PDFPage(Page):