mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-18 13:45:45 +00:00
chore: Update PDFDocument to use from_file method (#35)
* update PDFDocument to use from_file method
* bump version
This commit is contained in:
parent
2d5dba0ddc
commit
704d6e11d1
@ -1,5 +1,6 @@
|
|||||||
## 0.2.1-dev8
|
## 0.2.1-dev9
|
||||||
|
|
||||||
|
* Update `PDFDocument` to use the `from_file` method
|
||||||
* Added staging brick for CSV format for ISD (Initial Structured Data) format.
|
* Added staging brick for CSV format for ISD (Initial Structured Data) format.
|
||||||
* Added staging brick for separating text into attention window size chunks for `transformers`.
|
* Added staging brick for separating text into attention window size chunks for `transformers`.
|
||||||
* Added staging brick for LabelBox.
|
* Added staging brick for LabelBox.
|
||||||
|
@ -100,15 +100,13 @@ def test_read_pdf(monkeypatch, mock_page_layout):
|
|||||||
images = [image, image]
|
images = [image, image]
|
||||||
|
|
||||||
layouts = Layout([mock_page_layout, mock_page_layout])
|
layouts = Layout([mock_page_layout, mock_page_layout])
|
||||||
page = PDFPage(number=0, image=image, layout=mock_page_layout)
|
|
||||||
|
|
||||||
monkeypatch.setattr(detectron2, "model", MockLayoutModel(mock_page_layout))
|
monkeypatch.setattr(detectron2, "model", MockLayoutModel(mock_page_layout))
|
||||||
monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
|
monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
|
||||||
|
|
||||||
doc = PDFDocument("fake-file.pdf")
|
|
||||||
|
|
||||||
with patch.object(lp, "load_pdf", return_value=(layouts, images)):
|
with patch.object(lp, "load_pdf", return_value=(layouts, images)):
|
||||||
page.get_elements()
|
doc = PDFDocument.from_file("fake-file.pdf")
|
||||||
|
|
||||||
assert str(doc).startswith("A Catchy Title")
|
assert str(doc).startswith("A Catchy Title")
|
||||||
assert str(doc).count("A Catchy Title") == 2 # Once for each page
|
assert str(doc).count("A Catchy Title") == 2 # Once for each page
|
||||||
assert str(doc).endswith("A very repetitive narrative. ")
|
assert str(doc).endswith("A very repetitive narrative. ")
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.2.1-dev8" # pragma: no cover
|
__version__ = "0.2.1-dev9" # pragma: no cover
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
from unstructured.documents.elements import Element, NarrativeText
|
from unstructured.documents.elements import Element, NarrativeText
|
||||||
@ -15,7 +15,6 @@ class Document(ABC):
|
|||||||
def __str__(self) -> str:
|
def __str__(self) -> str:
|
||||||
return "\n\n".join([str(page) for page in self.pages])
|
return "\n\n".join([str(page) for page in self.pages])
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def _read(self) -> List[Page]: # pragma: no cover
|
def _read(self) -> List[Page]: # pragma: no cover
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -19,7 +19,7 @@ class PDFDocument(Document):
|
|||||||
document image analysis (DIA) model detects the layout of the page prior to extracting
|
document image analysis (DIA) model detects the layout of the page prior to extracting
|
||||||
element."""
|
element."""
|
||||||
|
|
||||||
def __init__(self, filename):
|
def __init__(self):
|
||||||
print(
|
print(
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -29,12 +29,12 @@ WARNING: PDF parsing capabilities in unstructured is still experimental
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
self.filename = filename
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
def _read(self) -> List[Page]:
|
@classmethod
|
||||||
logger.info(f"Reading PDF for file: {self.filename} ...")
|
def from_file(cls, filename: str):
|
||||||
layouts, images = lp.load_pdf(self.filename, load_images=True)
|
logger.info(f"Reading PDF for file: {filename} ...")
|
||||||
|
layouts, images = lp.load_pdf(filename, load_images=True)
|
||||||
pages: List[Page] = list()
|
pages: List[Page] = list()
|
||||||
for i, layout in enumerate(layouts):
|
for i, layout in enumerate(layouts):
|
||||||
image = images[i]
|
image = images[i]
|
||||||
@ -43,7 +43,7 @@ WARNING: PDF parsing capabilities in unstructured is still experimental
|
|||||||
page = PDFPage(number=i, image=image, layout=layout)
|
page = PDFPage(number=i, image=image, layout=layout)
|
||||||
page.get_elements()
|
page.get_elements()
|
||||||
pages.append(page)
|
pages.append(page)
|
||||||
return pages
|
return cls.from_pages(pages)
|
||||||
|
|
||||||
|
|
||||||
class PDFPage(Page):
|
class PDFPage(Page):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user