mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-18 13:45:45 +00:00
chore: Update PDFDocument to use from_file method (#35)
* update PDFDocument to use from_file method * bump version
This commit is contained in:
parent
2d5dba0ddc
commit
704d6e11d1
@ -1,5 +1,6 @@
|
||||
## 0.2.1-dev8
|
||||
## 0.2.1-dev9
|
||||
|
||||
* Update `PDFDocument` to use the `from_file` method
|
||||
* Added staging brick for the CSV form of the ISD (Initial Structured Data) format.
|
||||
* Added staging brick for separating text into attention window size chunks for `transformers`.
|
||||
* Added staging brick for LabelBox.
|
||||
|
@ -100,15 +100,13 @@ def test_read_pdf(monkeypatch, mock_page_layout):
|
||||
images = [image, image]
|
||||
|
||||
layouts = Layout([mock_page_layout, mock_page_layout])
|
||||
page = PDFPage(number=0, image=image, layout=mock_page_layout)
|
||||
|
||||
monkeypatch.setattr(detectron2, "model", MockLayoutModel(mock_page_layout))
|
||||
monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
|
||||
|
||||
doc = PDFDocument("fake-file.pdf")
|
||||
|
||||
with patch.object(lp, "load_pdf", return_value=(layouts, images)):
|
||||
page.get_elements()
|
||||
doc = PDFDocument.from_file("fake-file.pdf")
|
||||
|
||||
assert str(doc).startswith("A Catchy Title")
|
||||
assert str(doc).count("A Catchy Title") == 2 # Once for each page
|
||||
assert str(doc).endswith("A very repetitive narrative. ")
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.2.1-dev8" # pragma: no cover
|
||||
__version__ = "0.2.1-dev9" # pragma: no cover
|
||||
|
@ -1,5 +1,5 @@
|
||||
from __future__ import annotations
|
||||
from abc import ABC, abstractmethod
|
||||
from abc import ABC
|
||||
from typing import List, Optional
|
||||
|
||||
from unstructured.documents.elements import Element, NarrativeText
|
||||
@ -15,7 +15,6 @@ class Document(ABC):
|
||||
def __str__(self) -> str:
|
||||
return "\n\n".join([str(page) for page in self.pages])
|
||||
|
||||
@abstractmethod
|
||||
def _read(self) -> List[Page]: # pragma: no cover
|
||||
pass
|
||||
|
||||
|
@ -19,7 +19,7 @@ class PDFDocument(Document):
|
||||
document image analysis (DIA) model detects the layout of the page prior to extracting
|
||||
elements."""
|
||||
|
||||
def __init__(self, filename):
|
||||
def __init__(self):
|
||||
print(
|
||||
"""
|
||||
|
||||
@ -29,12 +29,12 @@ WARNING: PDF parsing capabilities in unstructured is still experimental
|
||||
|
||||
"""
|
||||
)
|
||||
self.filename = filename
|
||||
super().__init__()
|
||||
|
||||
def _read(self) -> List[Page]:
|
||||
logger.info(f"Reading PDF for file: {self.filename} ...")
|
||||
layouts, images = lp.load_pdf(self.filename, load_images=True)
|
||||
@classmethod
|
||||
def from_file(cls, filename: str):
|
||||
logger.info(f"Reading PDF for file: {filename} ...")
|
||||
layouts, images = lp.load_pdf(filename, load_images=True)
|
||||
pages: List[Page] = list()
|
||||
for i, layout in enumerate(layouts):
|
||||
image = images[i]
|
||||
@ -43,7 +43,7 @@ WARNING: PDF parsing capabilities in unstructured is still experimental
|
||||
page = PDFPage(number=i, image=image, layout=layout)
|
||||
page.get_elements()
|
||||
pages.append(page)
|
||||
return pages
|
||||
return cls.from_pages(pages)
|
||||
|
||||
|
||||
class PDFPage(Page):
|
||||
|
Loading…
x
Reference in New Issue
Block a user