Feat: form parsing placeholders (#3034)

Lays the groundwork for future form extraction: sets up the
FormKeysValues element and its serialization format, and adds a placeholder
function call to the partition_pdf_or_image pipeline.
This commit is contained in:
Jan Kanty Milczek 2024-05-16 16:21:31 +02:00 committed by GitHub
parent 1fb0fe5cf5
commit e6ada05c55
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 285 additions and 29 deletions

View File

@ -1,4 +1,4 @@
## 0.13.8-dev12 ## 0.13.8-dev13
### Enhancements ### Enhancements
@ -7,6 +7,7 @@
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy. * **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.
### Features ### Features
* **Add form extraction basics (document elements and placeholder code in partition).** This lays the groundwork for future form extraction. Form extraction models are not currently available in the library; any attempt to use this functionality raises a `NotImplementedError`.
### Fixes ### Fixes

View File

@ -0,0 +1,149 @@
[
{
"type": "FormKeysValues",
"element_id": "MOCK_FORM_ID",
"text": "",
"metadata": {
"coordinates": {
"points": [
[
35.15625,
95.556640625
],
[
710.357666015625,
95.556640625
],
[
710.357666015625,
887.890625
],
[
35.15625,
887.890625
]
],
"system": "PixelSpace",
"layout_width": 754,
"layout_height": 1000
},
"page_number": 1,
"key_value_pairs": [
{
"key": {
"text": "MOCK KEY",
"custom_element": {
"type": "UncategorizedText",
"element_id": "MOCK_KEY_ID_1",
"text": "MOCK KEY",
"metadata": {
"coordinates": {
"points": [
[
503.271484375,
96.3897705078125
],
[
503.271484375,
107.5164794921875
],
[
606.103515625,
107.5164794921875
],
[
606.103515625,
96.3897705078125
]
],
"system": "PixelSpace",
"layout_width": 754,
"layout_height": 1000
},
"page_number": 1
}
},
"layout_element_id": null
},
"value": {
"text": "MOCK VALUE",
"custom_element": {
"type": "UncategorizedText",
"element_id": "MOCK_VALUE_ID",
"text": "MOCK VALUE",
"metadata": {
"coordinates": {
"points": [
[
557.568359375,
124.8626708984375
],
[
557.568359375,
136.6607666015625
],
[
595.556640625,
136.6607666015625
],
[
595.556640625,
124.8626708984375
]
],
"system": "PixelSpace",
"layout_width": 754,
"layout_height": 1000
},
"page_number": 1
}
},
"layout_element_id": null
},
"confidence": 0.0
},
{
"key": {
"text": "MOCK KEY 2",
"custom_element": {
"type": "UncategorizedText",
"element_id": "MOCK_KEY_ID_2",
"text": "MOCK KEY 2",
"metadata": {
"coordinates": {
"points": [
[
428.52783203125,
124.0478515625
],
[
428.52783203125,
136.6943359375
],
[
473.81591796875,
136.6943359375
],
[
473.81591796875,
124.0478515625
]
],
"system": "PixelSpace",
"layout_width": 754,
"layout_height": 1000
},
"page_number": 1
}
},
"layout_element_id": null
},
"value": null,
"confidence": 0.0
}
],
"file_directory": "dataset/testing_data/images",
"filename": "MOCK.png"
}
}
]

View File

@ -5,13 +5,14 @@
from __future__ import annotations from __future__ import annotations
import copy import copy
import io
import json import json
import pathlib import pathlib
from functools import partial from functools import partial
import pytest import pytest
from test_unstructured.unit_utils import assign_hash_ids from test_unstructured.unit_utils import assign_hash_ids, example_doc_path
from unstructured.cleaners.core import clean_bullets, clean_prefix from unstructured.cleaners.core import clean_bullets, clean_prefix
from unstructured.documents.coordinates import ( from unstructured.documents.coordinates import (
CoordinateSystem, CoordinateSystem,
@ -31,6 +32,7 @@ from unstructured.documents.elements import (
Title, Title,
assign_and_map_hash_ids, assign_and_map_hash_ids,
) )
from unstructured.partition.json import partition_json
@pytest.mark.parametrize("element", [Element(), Text(text=""), CheckBox()]) @pytest.mark.parametrize("element", [Element(), Text(text=""), CheckBox()])
@ -744,3 +746,13 @@ def test_id_to_hash_calculates(text, sequence_number, filename, page_number, exp
) )
assert element.id_to_hash(sequence_number) == expected_hash, "Returned ID does not match" assert element.id_to_hash(sequence_number) == expected_hash, "Returned ID does not match"
assert element.id == expected_hash, "ID should be set" assert element.id == expected_hash, "ID should be set"
def test_formskeysvalues_reads_saves():
    """FormKeysValues elements survive a JSON write/read round trip unchanged."""
    source_path = example_doc_path("test_evaluate_files/unstructured_output/form.json")
    first_pass = partition_json(filename=source_path)

    # -- serialize the partitioned elements into an in-memory JSON document --
    element_dicts = [element.to_dict() for element in first_pass]
    buffer = io.StringIO()
    json.dump(element_dicts, buffer)
    # -- rewind so the second partition reads from the start of the buffer --
    buffer.seek(0)

    second_pass = partition_json(file=buffer)

    assert first_pass == second_pass

View File

@ -1 +1 @@
__version__ = "0.13.8-dev12" # pragma: no cover __version__ = "0.13.8-dev13" # pragma: no cover

View File

@ -143,6 +143,18 @@ class Link(TypedDict):
start_index: int start_index: int
class FormKeyOrValue(TypedDict):
    """One side (the key or the value) of a form key-value pair.

    Used as the `key` and `value` item type of `FormKeyValuePair`.
    """

    # -- the text of this key or value --
    text: str
    # -- NOTE(review): presumably the id of an already-detected layout element this
    # -- key/value corresponds to, when there is one -- confirm against the extractor --
    layout_element_id: Optional[str]
    # -- a Text element carried inline for this key/value; it is converted to a dict by
    # -- `_kvform_pairs_to_dict()` and back by `_kvform_rehydrate_internal_elements()` --
    custom_element: Optional[Text]
class FormKeyValuePair(TypedDict):
    """A single key/value association, the item type of `ElementMetadata.key_value_pairs`."""

    key: FormKeyOrValue
    # -- None when no value is associated with the key --
    value: Optional[FormKeyOrValue]
    # -- NOTE(review): presumably the extraction confidence for this pair; scale and
    # -- range are not visible here -- confirm --
    confidence: float
class ElementMetadata: class ElementMetadata:
"""Fully-dynamic replacement for dataclass-based ElementMetadata.""" """Fully-dynamic replacement for dataclass-based ElementMetadata."""
@ -176,6 +188,7 @@ class ElementMetadata:
header_footer_type: Optional[str] header_footer_type: Optional[str]
# -- used in chunks only, when chunk must be split mid-text to fit window -- # -- used in chunks only, when chunk must be split mid-text to fit window --
is_continuation: Optional[bool] is_continuation: Optional[bool]
key_value_pairs: Optional[list[FormKeyValuePair]]
languages: Optional[list[str]] languages: Optional[list[str]]
last_modified: Optional[str] last_modified: Optional[str]
link_texts: Optional[list[str]] link_texts: Optional[list[str]]
@ -327,6 +340,8 @@ class ElementMetadata:
self.data_source = DataSourceMetadata.from_dict(field_value) self.data_source = DataSourceMetadata.from_dict(field_value)
elif field_name == "orig_elements": elif field_name == "orig_elements":
self.orig_elements = elements_from_base64_gzipped_json(field_value) self.orig_elements = elements_from_base64_gzipped_json(field_value)
elif field_name == "key_value_pairs":
self.key_value_pairs = _kvform_rehydrate_internal_elements(field_value)
else: else:
setattr(self, field_name, field_value) setattr(self, field_name, field_value)
@ -392,6 +407,8 @@ class ElementMetadata:
meta_dict["data_source"] = self.data_source.to_dict() meta_dict["data_source"] = self.data_source.to_dict()
if self.orig_elements is not None: if self.orig_elements is not None:
meta_dict["orig_elements"] = elements_to_base64_gzipped_json(self.orig_elements) meta_dict["orig_elements"] = elements_to_base64_gzipped_json(self.orig_elements)
if self.key_value_pairs is not None:
meta_dict["key_value_pairs"] = _kvform_pairs_to_dict(self.key_value_pairs)
return meta_dict return meta_dict
@ -494,6 +511,7 @@ class ConsolidationStrategy(enum.Enum):
"text_as_html": cls.FIRST, # -- only occurs in Table -- "text_as_html": cls.FIRST, # -- only occurs in Table --
"table_as_cells": cls.FIRST, # -- only occurs in Table -- "table_as_cells": cls.FIRST, # -- only occurs in Table --
"url": cls.FIRST, "url": cls.FIRST,
"key_value_pairs": cls.DROP, # -- only occurs in FormKeysValues --
} }
@ -660,6 +678,7 @@ class ElementType:
PAGE_FOOTER = "Page-footer" PAGE_FOOTER = "Page-footer"
PAGE_NUMBER = "PageNumber" PAGE_NUMBER = "PageNumber"
CODE_SNIPPET = "CodeSnippet" CODE_SNIPPET = "CodeSnippet"
FORM_KEYS_VALUES = "FormKeysValues"
@classmethod @classmethod
def to_dict(cls): def to_dict(cls):
@ -992,6 +1011,12 @@ class PageNumber(Text):
category = "PageNumber" category = "PageNumber"
class FormKeysValues(Text):
    """An element for capturing Key-Value dicts (forms).

    The key-value pairs themselves are carried in the `key_value_pairs` metadata field.
    """

    # -- serialized as the element "type" via the `ElementType.FORM_KEYS_VALUES` mapping --
    category = "FormKeysValues"
TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = { TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = {
ElementType.TITLE: Title, ElementType.TITLE: Title,
ElementType.SECTION_HEADER: Title, ElementType.SECTION_HEADER: Title,
@ -1029,4 +1054,43 @@ TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = {
ElementType.PAGE_BREAK: PageBreak, ElementType.PAGE_BREAK: PageBreak,
ElementType.CODE_SNIPPET: CodeSnippet, ElementType.CODE_SNIPPET: CodeSnippet,
ElementType.PAGE_NUMBER: PageNumber, ElementType.PAGE_NUMBER: PageNumber,
ElementType.FORM_KEYS_VALUES: FormKeysValues,
} }
def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyValuePair]:
    """Convert the nested `custom_element` dicts in `kv_pairs` back into elements.

    The `key_value_pairs` metadata field usually holds nested Text elements. When the
    metadata arrives as plain dicts (e.g. via `partition_json`), those nested dicts must be
    turned back into element objects explicitly.

    Args:
        kv_pairs: key-value pair dicts; each has a "key" entry and a "value" entry (the
            latter may be None), and each of those has a "custom_element" entry that is
            either None or an element dict.

    Returns:
        The same list object, modified in place, with every non-None "custom_element"
        replaced by the element built from it.
    """
    # -- imported here rather than at module level, as in the original --
    from unstructured.staging.base import elements_from_dicts

    def rehydrate(side: dict) -> None:
        element_dict = side["custom_element"]
        if element_dict is not None:
            # -- single-item unpacking also asserts exactly one element came back --
            (side["custom_element"],) = elements_from_dicts([element_dict])

    # -- overwriting in place is intentional: per the original author's note, the
    # -- incoming structure has already been deep-copied --
    for pair in kv_pairs:
        rehydrate(pair["key"])
        value_side = pair["value"]
        if value_side is not None:
            rehydrate(value_side)

    return kv_pairs
def _kvform_pairs_to_dict(kv_pairs: list[FormKeyValuePair]) -> list[dict]:
    """Return a JSON-friendly deep copy of `kv_pairs`.

    The `key_value_pairs` metadata field usually holds nested Text elements. Each non-None
    "custom_element" in the copy is replaced by the result of its `.to_dict()`, e.g. when
    `FormKeysValues.to_dict()` is used. The input structure is left untouched.

    Args:
        kv_pairs: key-value pairs whose "key" and (optional) "value" entries may carry an
            element object under "custom_element".

    Returns:
        A new list of dicts in which every "custom_element" is either None or a dict.
    """

    def dictify(side: dict) -> None:
        element = side["custom_element"]
        if element is not None:
            side["custom_element"] = element.to_dict()

    # -- work on a deep copy so the caller's metadata keeps its element objects --
    serializable: list[dict] = copy.deepcopy(kv_pairs)
    for pair in serializable:
        dictify(pair["key"])
        value_side = pair["value"]
        if value_side is not None:
            dictify(value_side)

    return serializable

View File

@ -6,9 +6,7 @@ from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import add_metadata from unstructured.file_utils.filetype import add_metadata
from unstructured.partition.common import exactly_one from unstructured.partition.common import exactly_one
from unstructured.partition.lang import ( from unstructured.partition.lang import check_language_args
check_language_args,
)
from unstructured.partition.pdf import partition_pdf_or_image from unstructured.partition.pdf import partition_pdf_or_image
from unstructured.partition.utils.constants import PartitionStrategy from unstructured.partition.utils.constants import PartitionStrategy
@ -33,6 +31,8 @@ def partition_image(
extract_image_block_to_payload: bool = False, extract_image_block_to_payload: bool = False,
date_from_file_object: bool = False, date_from_file_object: bool = False,
starting_page_number: int = 1, starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
**kwargs: Any, **kwargs: Any,
) -> list[Element]: ) -> list[Element]:
"""Parses an image into a list of interpreted elements. """Parses an image into a list of interpreted elements.
@ -90,6 +90,11 @@ def partition_image(
date_from_file_object date_from_file_object
Applies only when providing file via `file` parameter. If this option is True, attempt Applies only when providing file via `file` parameter. If this option is True, attempt
infer last_modified metadata from bytes, otherwise set it to None. infer last_modified metadata from bytes, otherwise set it to None.
extract_forms
Whether the form extraction logic should be run
(results in adding FormKeysValues elements to output).
form_extraction_skip_tables
Whether the form extraction logic should ignore regions designated as Tables.
""" """
exactly_one(filename=filename, file=file) exactly_one(filename=filename, file=file)
@ -111,5 +116,7 @@ def partition_image(
extract_image_block_to_payload=extract_image_block_to_payload, extract_image_block_to_payload=extract_image_block_to_payload,
date_from_file_object=date_from_file_object, date_from_file_object=date_from_file_object,
starting_page_number=starting_page_number, starting_page_number=starting_page_number,
extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables,
**kwargs, **kwargs,
) )

View File

@ -12,13 +12,7 @@ import numpy as np
import pdf2image import pdf2image
import wrapt import wrapt
from pdfminer import psparser from pdfminer import psparser
from pdfminer.layout import ( from pdfminer.layout import LTChar, LTContainer, LTImage, LTItem, LTTextBox
LTChar,
LTContainer,
LTImage,
LTItem,
LTTextBox,
)
from pdfminer.pdftypes import PDFObjRef from pdfminer.pdftypes import PDFObjRef
from pdfminer.utils import open_filename from pdfminer.utils import open_filename
from PIL import Image as PILImage from PIL import Image as PILImage
@ -42,10 +36,7 @@ from unstructured.documents.elements import (
Text, Text,
process_metadata, process_metadata,
) )
from unstructured.file_utils.filetype import ( from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
FileType,
add_metadata_with_filetype,
)
from unstructured.logger import logger, trace_logger from unstructured.logger import logger, trace_logger
from unstructured.nlp.patterns import PARAGRAPH_PATTERN from unstructured.nlp.patterns import PARAGRAPH_PATTERN
from unstructured.partition.common import ( from unstructured.partition.common import (
@ -57,10 +48,8 @@ from unstructured.partition.common import (
ocr_data_to_elements, ocr_data_to_elements,
spooled_to_bytes_io_if_needed, spooled_to_bytes_io_if_needed,
) )
from unstructured.partition.lang import ( from unstructured.partition.lang import check_language_args, prepare_languages_for_tesseract
check_language_args, from unstructured.partition.pdf_image.form_extraction import run_form_extraction
prepare_languages_for_tesseract,
)
from unstructured.partition.pdf_image.pdf_image_utils import ( from unstructured.partition.pdf_image.pdf_image_utils import (
annotate_layout_elements, annotate_layout_elements,
check_element_types_to_extract, check_element_types_to_extract,
@ -85,10 +74,7 @@ from unstructured.partition.utils.constants import (
OCRMode, OCRMode,
PartitionStrategy, PartitionStrategy,
) )
from unstructured.partition.utils.sorting import ( from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements
coord_has_valid_points,
sort_page_elements,
)
from unstructured.patches.pdfminer import parse_keyword from unstructured.patches.pdfminer import parse_keyword
from unstructured.utils import requires_dependencies from unstructured.utils import requires_dependencies
@ -135,6 +121,8 @@ def partition_pdf(
extract_image_block_to_payload: bool = False, extract_image_block_to_payload: bool = False,
date_from_file_object: bool = False, date_from_file_object: bool = False,
starting_page_number: int = 1, starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
**kwargs: Any, **kwargs: Any,
) -> list[Element]: ) -> list[Element]:
"""Parses a pdf document into a list of interpreted elements. """Parses a pdf document into a list of interpreted elements.
@ -191,6 +179,11 @@ def partition_pdf(
date_from_file_object date_from_file_object
Applies only when providing file via `file` parameter. If this option is True, attempt Applies only when providing file via `file` parameter. If this option is True, attempt
infer last_modified metadata from bytes, otherwise set it to None. infer last_modified metadata from bytes, otherwise set it to None.
extract_forms
Whether the form extraction logic should be run
(results in adding FormKeysValues elements to output).
form_extraction_skip_tables
Whether the form extraction logic should ignore regions designated as Tables.
""" """
exactly_one(filename=filename, file=file) exactly_one(filename=filename, file=file)
@ -212,6 +205,7 @@ def partition_pdf(
extract_image_block_to_payload=extract_image_block_to_payload, extract_image_block_to_payload=extract_image_block_to_payload,
date_from_file_object=date_from_file_object, date_from_file_object=date_from_file_object,
starting_page_number=starting_page_number, starting_page_number=starting_page_number,
extract_forms=extract_forms,
**kwargs, **kwargs,
) )
@ -233,6 +227,8 @@ def partition_pdf_or_image(
extract_image_block_to_payload: bool = False, extract_image_block_to_payload: bool = False,
date_from_file_object: bool = False, date_from_file_object: bool = False,
starting_page_number: int = 1, starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
**kwargs: Any, **kwargs: Any,
) -> list[Element]: ) -> list[Element]:
"""Parses a pdf or image document into a list of interpreted elements.""" """Parses a pdf or image document into a list of interpreted elements."""
@ -304,6 +300,8 @@ def partition_pdf_or_image(
extract_image_block_output_dir=extract_image_block_output_dir, extract_image_block_output_dir=extract_image_block_output_dir,
extract_image_block_to_payload=extract_image_block_to_payload, extract_image_block_to_payload=extract_image_block_to_payload,
starting_page_number=starting_page_number, starting_page_number=starting_page_number,
extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables,
**kwargs, **kwargs,
) )
out_elements = _process_uncategorized_text_elements(elements) out_elements = _process_uncategorized_text_elements(elements)
@ -390,6 +388,8 @@ def _partition_pdf_or_image_local(
analysis: bool = False, analysis: bool = False,
analyzed_image_output_dir_path: Optional[str] = None, analyzed_image_output_dir_path: Optional[str] = None,
starting_page_number: int = 1, starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
**kwargs: Any, **kwargs: Any,
) -> list[Element]: ) -> list[Element]:
"""Partition using package installed locally""" """Partition using package installed locally"""
@ -398,10 +398,7 @@ def _partition_pdf_or_image_local(
process_file_with_model, process_file_with_model,
) )
from unstructured.partition.pdf_image.ocr import ( from unstructured.partition.pdf_image.ocr import process_data_with_ocr, process_file_with_ocr
process_data_with_ocr,
process_file_with_ocr,
)
from unstructured.partition.pdf_image.pdfminer_processing import ( from unstructured.partition.pdf_image.pdfminer_processing import (
process_data_with_pdfminer, process_data_with_pdfminer,
process_file_with_pdfminer, process_file_with_pdfminer,
@ -581,6 +578,16 @@ def _partition_pdf_or_image_local(
if el.text or isinstance(el, PageBreak) or hi_res_model_name.startswith("chipper"): if el.text or isinstance(el, PageBreak) or hi_res_model_name.startswith("chipper"):
out_elements.append(cast(Element, el)) out_elements.append(cast(Element, el))
if extract_forms:
forms = run_form_extraction(
file=file,
filename=filename,
model_name=hi_res_model_name,
elements=out_elements,
skip_table_regions=form_extraction_skip_tables,
)
out_elements.extend(forms)
return out_elements return out_elements

View File

@ -0,0 +1,15 @@
from __future__ import annotations
from typing import IO
from unstructured.documents.elements import Element, FormKeysValues
def run_form_extraction(
    filename: str,
    file: IO[bytes],
    model_name: str,
    elements: list[Element],
    skip_table_regions: bool,
) -> list[FormKeysValues]:
    """Placeholder for form (key-value) extraction.

    No form extraction model is available yet, so every argument is accepted but unused
    and the call always fails.

    Raises:
        NotImplementedError: always.
    """
    message = "Form extraction not yet available."
    raise NotImplementedError(message)

View File

@ -17,6 +17,7 @@ exclude_metadata_keys = (
"links", "links",
"orig_elements", "orig_elements",
"regex_metadata", "regex_metadata",
"key_value_pairs",
) )