diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d003e5c0..b0c602828 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ -## 0.2.1-dev8 +## 0.2.1-dev9 +* Update `PDFDocument` to use the `from_file` method * Added staging brick for CSV format for ISD (Initial Structured Data) format. * Added staging brick for separating text into attention window size chunks for `transformers`. * Added staging brick for LabelBox. diff --git a/test_unstructured/documents/test_pdf.py b/test_unstructured/documents/test_pdf.py index 8601ad569..39c87bd9a 100644 --- a/test_unstructured/documents/test_pdf.py +++ b/test_unstructured/documents/test_pdf.py @@ -100,15 +100,13 @@ def test_read_pdf(monkeypatch, mock_page_layout): images = [image, image] layouts = Layout([mock_page_layout, mock_page_layout]) - page = PDFPage(number=0, image=image, layout=mock_page_layout) monkeypatch.setattr(detectron2, "model", MockLayoutModel(mock_page_layout)) monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True) - doc = PDFDocument("fake-file.pdf") - with patch.object(lp, "load_pdf", return_value=(layouts, images)): - page.get_elements() + doc = PDFDocument.from_file("fake-file.pdf") + assert str(doc).startswith("A Catchy Title") assert str(doc).count("A Catchy Title") == 2 # Once for each page assert str(doc).endswith("A very repetitive narrative. ") diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 5aed66a07..a31549ba7 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.2.1-dev8" # pragma: no cover +__version__ = "0.2.1-dev9" # pragma: no cover diff --git a/unstructured/documents/base.py b/unstructured/documents/base.py index 077c0e957..aeb97045e 100644 --- a/unstructured/documents/base.py +++ b/unstructured/documents/base.py @@ -1,5 +1,5 @@ from __future__ import annotations -from abc import ABC, abstractmethod +from abc import ABC from typing import List, Optional from unstructured.documents.elements import Element, NarrativeText @@ -15,7 +15,6 @@ class Document(ABC): def __str__(self) -> str: return "\n\n".join([str(page) for page in self.pages]) - @abstractmethod def _read(self) -> List[Page]: # pragma: no cover pass diff --git a/unstructured/documents/pdf.py b/unstructured/documents/pdf.py index 40e21c93c..f984e8f32 100644 --- a/unstructured/documents/pdf.py +++ b/unstructured/documents/pdf.py @@ -19,7 +19,7 @@ class PDFDocument(Document): document image analysis (DIA) model detects the layout of the page prior to extracting element.""" - def __init__(self, filename): + def __init__(self): print( """ @@ -29,12 +29,12 @@ WARNING: PDF parsing capabilities in unstructured is still experimental """ ) - self.filename = filename super().__init__() - def _read(self) -> List[Page]: - logger.info(f"Reading PDF for file: {self.filename} ...") - layouts, images = lp.load_pdf(self.filename, load_images=True) + @classmethod + def from_file(cls, filename: str): + logger.info(f"Reading PDF for file: {filename} ...") + layouts, images = lp.load_pdf(filename, load_images=True) pages: List[Page] = list() for i, layout in enumerate(layouts): image = images[i] @@ -43,7 +43,7 @@ WARNING: PDF parsing capabilities in unstructured is still experimental page = PDFPage(number=i, image=image, layout=layout) page.get_elements() pages.append(page) - return pages + return cls.from_pages(pages) class PDFPage(Page):