mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-14 16:52:11 +00:00
Fix/fix table id checking logic (#3898)
- There is a bug in deciding whether a page has tables before performing table extraction. The logic checks whether the id associated with the Table element type is truthy (`if not table_id`); however, it should check whether the id is `None`, because the id can legitimately be 0 (when Table is the first element type in the page's class-id map). - The fix updates the logic to `if table_id is None`. - Adds a unit test for this specific case.
This commit is contained in:
parent
a368aac4a3
commit
9d58b34ab4
@ -5,6 +5,7 @@
|
|||||||
### Features
|
### Features
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
- **Fix a bug where table extraction is skipped when it shouldn't be**. Pages whose only content is a single table, or that start with a table, were missing table extraction. The routing logic is now fixed.
|
||||||
|
|
||||||
## 0.16.18-dev1
|
## 0.16.18-dev1
|
||||||
|
|
||||||
|
|||||||
BIN
example-docs/pdf/single_table.pdf
Normal file
BIN
example-docs/pdf/single_table.pdf
Normal file
Binary file not shown.
@ -16,9 +16,13 @@ from unstructured_inference.inference.layoutelement import (
|
|||||||
LayoutElements,
|
LayoutElements,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from test_unstructured.unit_utils import example_doc_path
|
||||||
from unstructured.documents.elements import ElementType
|
from unstructured.documents.elements import ElementType
|
||||||
from unstructured.partition.pdf_image import ocr
|
from unstructured.partition.pdf_image import ocr
|
||||||
from unstructured.partition.pdf_image.pdf_image_utils import pad_element_bboxes
|
from unstructured.partition.pdf_image.pdf_image_utils import (
|
||||||
|
convert_pdf_to_images,
|
||||||
|
pad_element_bboxes,
|
||||||
|
)
|
||||||
from unstructured.partition.utils.config import env_config
|
from unstructured.partition.utils.config import env_config
|
||||||
from unstructured.partition.utils.constants import (
|
from unstructured.partition.utils.constants import (
|
||||||
Source,
|
Source,
|
||||||
@ -436,6 +440,28 @@ def mock_ocr_layout():
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_supplement_element_with_table_extraction():
|
||||||
|
from unstructured_inference.models import tables
|
||||||
|
|
||||||
|
tables.load_agent()
|
||||||
|
|
||||||
|
image = next(convert_pdf_to_images(example_doc_path("pdf/single_table.pdf")))
|
||||||
|
elements = LayoutElements(
|
||||||
|
element_coords=np.array([[215.00109863, 731.89996338, 1470.07739258, 972.83129883]]),
|
||||||
|
texts=np.array(["foo"]),
|
||||||
|
sources=np.array(["yolox_sg"]),
|
||||||
|
element_class_ids=np.array([0]),
|
||||||
|
element_class_id_map={0: "Table"},
|
||||||
|
)
|
||||||
|
supplemented = ocr.supplement_element_with_table_extraction(
|
||||||
|
elements=elements,
|
||||||
|
image=image,
|
||||||
|
tables_agent=tables.tables_agent,
|
||||||
|
ocr_agent=ocr.OCRAgent.get_agent(language="eng"),
|
||||||
|
)
|
||||||
|
assert supplemented.text_as_html[0].startswith("<table>")
|
||||||
|
|
||||||
|
|
||||||
def test_get_table_tokens(mock_ocr_layout):
|
def test_get_table_tokens(mock_ocr_layout):
|
||||||
with patch.object(OCRAgentTesseract, "get_layout_from_image", return_value=mock_ocr_layout):
|
with patch.object(OCRAgentTesseract, "get_layout_from_image", return_value=mock_ocr_layout):
|
||||||
ocr_agent = OCRAgent.get_agent(language="eng")
|
ocr_agent = OCRAgent.get_agent(language="eng")
|
||||||
|
|||||||
@ -276,7 +276,7 @@ def supplement_element_with_table_extraction(
|
|||||||
from unstructured_inference.models.tables import cells_to_html
|
from unstructured_inference.models.tables import cells_to_html
|
||||||
|
|
||||||
table_id = {v: k for k, v in elements.element_class_id_map.items()}.get(ElementType.TABLE)
|
table_id = {v: k for k, v in elements.element_class_id_map.items()}.get(ElementType.TABLE)
|
||||||
if not table_id:
|
if table_id is None:
|
||||||
# no table found in this page
|
# no table found in this page
|
||||||
return elements
|
return elements
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user