chore: Update PDFDocument to use from_file method (#35)

* update PDFDocument to use from_file method
* bump version
2025-12-18 10:44:23 +00:00 · 2022-10-13 12:04:30 -04:00 · 2022-10-13 12:04:30 -04:00 · 704d6e11d1
commit 704d6e11d1
parent 2d5dba0ddc
5 changed files with 12 additions and 14 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,6 @@
-## 0.2.1-dev8
+## 0.2.1-dev9

+* Update `PDFDocument` to use the `from_file` method
 * Added staging brick for CSV format for ISD (Initial Structured Data) format.
 * Added staging brick for separating text into attention window size chunks for `transformers`.
 * Added staging brick for LabelBox.
--- a/test_unstructured/documents/test_pdf.py
+++ b/test_unstructured/documents/test_pdf.py
@ -100,15 +100,13 @@ def test_read_pdf(monkeypatch, mock_page_layout):
    images = [image, image]

    layouts = Layout([mock_page_layout, mock_page_layout])
-    page = PDFPage(number=0, image=image, layout=mock_page_layout)

    monkeypatch.setattr(detectron2, "model", MockLayoutModel(mock_page_layout))
    monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)

-    doc = PDFDocument("fake-file.pdf")
-
    with patch.object(lp, "load_pdf", return_value=(layouts, images)):
-        page.get_elements()
+        doc = PDFDocument.from_file("fake-file.pdf")
+
        assert str(doc).startswith("A Catchy Title")
        assert str(doc).count("A Catchy Title") == 2  # Once for each page
        assert str(doc).endswith("A very repetitive narrative. ")
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.2.1-dev8"  # pragma: no cover
+__version__ = "0.2.1-dev9"  # pragma: no cover
--- a/unstructured/documents/base.py
+++ b/unstructured/documents/base.py
@ -1,5 +1,5 @@
 from __future__ import annotations
-from abc import ABC, abstractmethod
+from abc import ABC
 from typing import List, Optional

 from unstructured.documents.elements import Element, NarrativeText
@ -15,7 +15,6 @@ class Document(ABC):
    def __str__(self) -> str:
        return "\n\n".join([str(page) for page in self.pages])

-    @abstractmethod
    def _read(self) -> List[Page]:  # pragma: no cover
        pass

--- a/unstructured/documents/pdf.py
+++ b/unstructured/documents/pdf.py
@ -19,7 +19,7 @@ class PDFDocument(Document):
    document image analysis (DIA) model detects the layout of the page prior to extracting
    element."""

-    def __init__(self, filename):
+    def __init__(self):
        print(
            """

@ -29,12 +29,12 @@ WARNING: PDF parsing capabilities in unstructured is still experimental

 """
        )
-        self.filename = filename
        super().__init__()

-    def _read(self) -> List[Page]:
-        logger.info(f"Reading PDF for file: {self.filename} ...")
-        layouts, images = lp.load_pdf(self.filename, load_images=True)
+    @classmethod
+    def from_file(cls, filename: str):
+        logger.info(f"Reading PDF for file: {filename} ...")
+        layouts, images = lp.load_pdf(filename, load_images=True)
        pages: List[Page] = list()
        for i, layout in enumerate(layouts):
            image = images[i]
@ -43,7 +43,7 @@ WARNING: PDF parsing capabilities in unstructured is still experimental
            page = PDFPage(number=i, image=image, layout=layout)
            page.get_elements()
            pages.append(page)
-        return pages
+        return cls.from_pages(pages)


 class PDFPage(Page):