mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-26 06:36:06 +00:00
Don't instantiate an element with a coordinate system when there isn't a way to get its location (#913)
This commit is contained in:
parent
b3936893b8
commit
2635b0be07
@ -1,4 +1,4 @@
|
||||
## 0.8.1-dev1
|
||||
## 0.8.1-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -10,6 +10,7 @@
|
||||
|
||||
* Fixed `auto` strategy detecting scanned documents as having extractable text and using the `fast` strategy, resulting in no output.
|
||||
* Fix list detection in MS Word documents.
|
||||
* Don't instantiate an element with a coordinate system when there isn't a way to get its location data.
|
||||
|
||||
## 0.8.0
|
||||
|
||||
|
||||
@ -4,6 +4,9 @@ import zipfile
|
||||
|
||||
import magic
|
||||
import pytest
|
||||
from PIL import Image
|
||||
from unstructured_inference.inference import layout
|
||||
from unstructured_inference.inference.layoutelement import LocationlessLayoutElement
|
||||
|
||||
from unstructured.file_utils import filetype
|
||||
from unstructured.file_utils.filetype import (
|
||||
@ -12,6 +15,7 @@ from unstructured.file_utils.filetype import (
|
||||
_is_text_file_a_csv,
|
||||
_is_text_file_a_json,
|
||||
detect_filetype,
|
||||
document_to_element_list,
|
||||
)
|
||||
|
||||
FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
@ -26,6 +30,29 @@ XLSX_MIME_TYPES = [
|
||||
]
|
||||
|
||||
|
||||
class MockPageLayout(layout.PageLayout):
    """Page layout stub whose only element carries no location data.

    Used to exercise code paths that receive a page with an image but with
    layout elements that expose no coordinates.
    """

    def __init__(self, number: int, image: Image.Image):
        """Store the page number and page image.

        Args:
            number: Page number to report for this page.
            image: Image backing the page. ``Image`` is the PIL module in this
                file, so the instance type is ``Image.Image``.
        """
        # NOTE(review): the parent constructor is not called here; presumably
        # this is deliberate so the stub needs no other inputs. Confirm.
        self.number = number
        self.image = image

    @property
    def elements(self):
        """Return a single ``LocationlessLayoutElement`` headline."""
        return [
            LocationlessLayoutElement(
                type="Headline",
                text="Charlie Brown and the Great Pumpkin",
            ),
        ]
|
||||
|
||||
|
||||
class MockDocumentLayout(layout.DocumentLayout):
    """Document layout stub that exposes exactly one mocked page."""

    @property
    def pages(self):
        """Return a one-item list: page number 1 backed by a 1x1 mode-"1" image."""
        blank_image = Image.new("1", (1, 1))
        only_page = MockPageLayout(number=1, image=blank_image)
        return [only_page]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("file", "expected"),
|
||||
[
|
||||
@ -436,3 +463,9 @@ def test_detect_filetype_skips_escape_commas_for_csv(tmpdir):
|
||||
|
||||
with open(filename, "rb") as f:
|
||||
assert detect_filetype(file=f) == FileType.CSV
|
||||
|
||||
|
||||
def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
    """The first element built from the mocked document has no coordinates metadata."""
    converted = document_to_element_list(MockDocumentLayout())
    first_metadata = converted[0].metadata
    assert first_metadata.coordinates is None
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.8.1-dev1" # pragma: no cover
|
||||
__version__ = "0.8.1-dev2" # pragma: no cover
|
||||
|
||||
@ -451,7 +451,7 @@ def document_to_element_list(
|
||||
for i, page in enumerate(document.pages):
|
||||
page_elements: List[Element] = []
|
||||
for layout_element in page.elements:
|
||||
if hasattr(page, "image"):
|
||||
if hasattr(page, "image") and hasattr(layout_element, "coordinates"):
|
||||
image_format = page.image.format
|
||||
coordinate_system = PixelSpace(width=page.image.width, height=page.image.height)
|
||||
else:
|
||||
|
||||
@ -326,12 +326,16 @@ def _process_pdfminer_pages(
|
||||
_text = clean_extra_whitespace(_text)
|
||||
if _text.strip():
|
||||
text_segments.append(_text)
|
||||
element = element_from_text(_text)
|
||||
coordinate_system = PixelSpace(
|
||||
width=width,
|
||||
height=height,
|
||||
)
|
||||
points = ((x1, y1), (x1, y2), (x2, y2), (x2, y1))
|
||||
element = element_from_text(
|
||||
_text,
|
||||
coordinates=points,
|
||||
coordinate_system=coordinate_system,
|
||||
)
|
||||
coordinates_metadata = CoordinatesMetadata(
|
||||
points=points,
|
||||
system=coordinate_system,
|
||||
|
||||
@ -1,7 +1,8 @@
|
||||
import re
|
||||
from typing import IO, Callable, List, Optional
|
||||
from typing import IO, Callable, List, Optional, Tuple
|
||||
|
||||
from unstructured.cleaners.core import clean_bullets, group_broken_paragraphs
|
||||
from unstructured.documents.coordinates import CoordinateSystem
|
||||
from unstructured.documents.elements import (
|
||||
Address,
|
||||
Element,
|
||||
@ -143,14 +144,26 @@ def partition_text(
|
||||
return elements
|
||||
|
||||
|
||||
def element_from_text(
    text: str,
    coordinates: Optional[Tuple[Tuple[float, float], ...]] = None,
    coordinate_system: Optional[CoordinateSystem] = None,
) -> Element:
    """Classify ``text`` and build the matching element.

    The checks run in a fixed order and the first match wins: bulleted text,
    US city/state/zip, possible narrative text, possible title, and finally a
    generic ``Text`` fallback.

    Args:
        text: Raw text to classify. For bulleted text the element receives
            ``clean_bullets(text)``; every other branch receives ``text``
            unchanged.
        coordinates: Optional points forwarded to the element constructor.
        coordinate_system: Optional coordinate system forwarded to the
            element constructor.

    Returns:
        A ``ListItem``, ``Address``, ``NarrativeText``, ``Title`` or ``Text``
        built with the given coordinates and coordinate system.
    """
    if is_bulleted_text(text):
        element_cls = ListItem
        element_text = clean_bullets(text)
    elif is_us_city_state_zip(text):
        element_cls = Address
        element_text = text
    elif is_possible_narrative_text(text):
        element_cls = NarrativeText
        element_text = text
    elif is_possible_title(text):
        element_cls = Title
        element_text = text
    else:
        element_cls = Text
        element_text = text

    # Every branch forwards the location arguments, so callers that supply
    # them never get an element without them.
    return element_cls(
        text=element_text,
        coordinates=coordinates,
        coordinate_system=coordinate_system,
    )
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user