Feat: form parsing placeholders (#3034)

Allows introduction of form extraction in the future - sets up the FormKeysValues element & format, puts in an empty function call in the partition_pdf_or_image pipeline.
2025-09-01 21:04:06 +00:00 · 2024-05-16 16:21:31 +02:00 · 2024-05-16 16:21:31 +02:00 · e6ada05c55
commit e6ada05c55
parent 1fb0fe5cf5
9 changed files with 285 additions and 29 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.13.8-dev12
+## 0.13.8-dev13
 ### Enhancements
@ -7,6 +7,7 @@
 * **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.
 ### Features
 * **Add form extraction basics (document elements and placeholder code in partition)**. This is to lay the groundwork for the future. Form extraction models are not currently available in the library. An attempt to use this functionality will end in a `NotImplementedError`.
 ### Fixes
--- a/example-docs/test_evaluate_files/unstructured_output/form.json
+++ b/example-docs/test_evaluate_files/unstructured_output/form.json
@ -0,0 +1,149 @@
 [
    {
        "type": "FormKeysValues",
        "element_id": "MOCK_FORM_ID",
        "text": "",
        "metadata": {
            "coordinates": {
                "points": [
                    [
                        35.15625,
                        95.556640625
                    ],
                    [
                        710.357666015625,
                        95.556640625
                    ],
                    [
                        710.357666015625,
                        887.890625
                    ],
                    [
                        35.15625,
                        887.890625
                    ]
                ],
                "system": "PixelSpace",
                "layout_width": 754,
                "layout_height": 1000
            },
            "page_number": 1,
            "key_value_pairs": [
                {
                    "key": {
                        "text": "MOCK KEY",
                        "custom_element": {
                            "type": "UncategorizedText",
                            "element_id": "MOCK_KEY_ID_1",
                            "text": "MOCK KEY",
                            "metadata": {
                                "coordinates": {
                                    "points": [
                                        [
                                            503.271484375,
                                            96.3897705078125
                                        ],
                                        [
                                            503.271484375,
                                            107.5164794921875
                                        ],
                                        [
                                            606.103515625,
                                            107.5164794921875
                                        ],
                                        [
                                            606.103515625,
                                            96.3897705078125
                                        ]
                                    ],
                                    "system": "PixelSpace",
                                    "layout_width": 754,
                                    "layout_height": 1000
                                },
                                "page_number": 1
                            }
                        },
                        "layout_element_id": null
                    },
                    "value": {
                        "text": "MOCK VALUE",
                        "custom_element": {
                            "type": "UncategorizedText",
                            "element_id": "MOCK_VALUE_ID",
                            "text": "MOCK VALUE",
                            "metadata": {
                                "coordinates": {
                                    "points": [
                                        [
                                            557.568359375,
                                            124.8626708984375
                                        ],
                                        [
                                            557.568359375,
                                            136.6607666015625
                                        ],
                                        [
                                            595.556640625,
                                            136.6607666015625
                                        ],
                                        [
                                            595.556640625,
                                            124.8626708984375
                                        ]
                                    ],
                                    "system": "PixelSpace",
                                    "layout_width": 754,
                                    "layout_height": 1000
                                },
                                "page_number": 1
                            }
                        },
                        "layout_element_id": null
                    },
                    "confidence": 0.0
                },
                {
                    "key": {
                        "text": "MOCK KEY 2",
                        "custom_element": {
                            "type": "UncategorizedText",
                            "element_id": "MOCK_KEY_ID_2",
                            "text": "MOCK KEY 2",
                            "metadata": {
                                "coordinates": {
                                    "points": [
                                        [
                                            428.52783203125,
                                            124.0478515625
                                        ],
                                        [
                                            428.52783203125,
                                            136.6943359375
                                        ],
                                        [
                                            473.81591796875,
                                            136.6943359375
                                        ],
                                        [
                                            473.81591796875,
                                            124.0478515625
                                        ]
                                    ],
                                    "system": "PixelSpace",
                                    "layout_width": 754,
                                    "layout_height": 1000
                                },
                                "page_number": 1
                            }
                        },
                        "layout_element_id": null
                    },
                    "value": null,
                    "confidence": 0.0
                }
            ],
            "file_directory": "dataset/testing_data/images",
            "filename": "MOCK.png"
        }
    }
 ]
--- a/test_unstructured/documents/test_elements.py
+++ b/test_unstructured/documents/test_elements.py
@ -5,13 +5,14 @@
 from __future__ import annotations
 import copy
 import io
 import json
 import pathlib
 from functools import partial
 import pytest
-from test_unstructured.unit_utils import assign_hash_ids
+from test_unstructured.unit_utils import assign_hash_ids, example_doc_path
 from unstructured.cleaners.core import clean_bullets, clean_prefix
 from unstructured.documents.coordinates import (
    CoordinateSystem,
@ -31,6 +32,7 @@ from unstructured.documents.elements import (
    Title,
    assign_and_map_hash_ids,
 )
 from unstructured.partition.json import partition_json
@pytest.mark.parametrize("element", [Element(), Text(text=""), CheckBox()])
@ -744,3 +746,13 @@ def test_id_to_hash_calculates(text, sequence_number, filename, page_number, exp
    )
    assert element.id_to_hash(sequence_number) == expected_hash, "Returned ID does not match"
    assert element.id == expected_hash, "ID should be set"
 def test_formskeysvalues_reads_saves():
    """Elements from the form fixture survive a to_dict -> JSON -> partition_json round-trip."""
    fixture_path = example_doc_path("test_evaluate_files/unstructured_output/form.json")
    first_pass = partition_json(filename=fixture_path)

    # -- serialize what was read and feed it straight back through the JSON partitioner --
    serialized = json.dumps([el.to_dict() for el in first_pass])
    second_pass = partition_json(file=io.StringIO(serialized))

    assert first_pass == second_pass
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.13.8-dev12"  # pragma: no cover
+__version__ = "0.13.8-dev13"  # pragma: no cover
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@ -143,6 +143,18 @@ class Link(TypedDict):
    start_index: int
 class FormKeyOrValue(TypedDict):
    """One side (the key or the value) of a form key-value pair."""

    # -- text of this key or value --
    text: str
    # -- NOTE(review): presumably the id of an already-partitioned element this side refers to;
    # -- the example fixture only ever carries null here - confirm against the extraction model --
    layout_element_id: Optional[str]
    # -- nested Text element for this side, when there is one; `_kvform_pairs_to_dict()` turns it
    # -- into a dict on serialization and `_kvform_rehydrate_internal_elements()` rebuilds it --
    custom_element: Optional[Text]
 class FormKeyValuePair(TypedDict):
    """A form key with its optional value, the item type of `ElementMetadata.key_value_pairs`."""

    key: FormKeyOrValue
    # -- None when the key has no associated value --
    value: Optional[FormKeyOrValue]
    # -- NOTE(review): scale and meaning of this score are not visible here - confirm --
    confidence: float
 class ElementMetadata:
    """Fully-dynamic replacement for dataclass-based ElementMetadata."""
@ -176,6 +188,7 @@ class ElementMetadata:
    header_footer_type: Optional[str]
    # -- used in chunks only, when chunk must be split mid-text to fit window --
    is_continuation: Optional[bool]
    key_value_pairs: Optional[list[FormKeyValuePair]]
    languages: Optional[list[str]]
    last_modified: Optional[str]
    link_texts: Optional[list[str]]
@ -327,6 +340,8 @@ class ElementMetadata:
                self.data_source = DataSourceMetadata.from_dict(field_value)
            elif field_name == "orig_elements":
                self.orig_elements = elements_from_base64_gzipped_json(field_value)
            elif field_name == "key_value_pairs":
                self.key_value_pairs = _kvform_rehydrate_internal_elements(field_value)
            else:
                setattr(self, field_name, field_value)
@ -392,6 +407,8 @@ class ElementMetadata:
            meta_dict["data_source"] = self.data_source.to_dict()
        if self.orig_elements is not None:
            meta_dict["orig_elements"] = elements_to_base64_gzipped_json(self.orig_elements)
        if self.key_value_pairs is not None:
            meta_dict["key_value_pairs"] = _kvform_pairs_to_dict(self.key_value_pairs)
        return meta_dict
@ -494,6 +511,7 @@ class ConsolidationStrategy(enum.Enum):
            "text_as_html": cls.FIRST,  # -- only occurs in Table --
            "table_as_cells": cls.FIRST,  # -- only occurs in Table --
            "url": cls.FIRST,
            "key_value_pairs": cls.DROP,  # -- only occurs in FormKeysValues --
        }
@ -660,6 +678,7 @@ class ElementType:
    PAGE_FOOTER = "Page-footer"
    PAGE_NUMBER = "PageNumber"
    CODE_SNIPPET = "CodeSnippet"
    FORM_KEYS_VALUES = "FormKeysValues"
    @classmethod
    def to_dict(cls):
@ -992,6 +1011,12 @@ class PageNumber(Text):
    category = "PageNumber"
 class FormKeysValues(Text):
    """An element for capturing Key-Value dicts (forms).

    The pairs themselves are carried in the `key_value_pairs` metadata field.
    """
    category = "FormKeysValues"
 TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = {
    ElementType.TITLE: Title,
    ElementType.SECTION_HEADER: Title,
@ -1029,4 +1054,43 @@ TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = {
    ElementType.PAGE_BREAK: PageBreak,
    ElementType.CODE_SNIPPET: CodeSnippet,
    ElementType.PAGE_NUMBER: PageNumber,
    ElementType.FORM_KEYS_VALUES: FormKeysValues,
 }
 def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyValuePair]:
    """Rebuild the Element objects nested inside serialized `key_value_pairs` metadata.

    Each pair's key (and its value, when present) may hold a `custom_element` in dict form,
    e.g. after loading with partition_json. Every such dict is converted back into an element
    with `elements_from_dicts()`.

    The conversion happens in place and the same list is returned; the original author notes
    this is safe because the caller has already deep-copied the structure.
    """
    from unstructured.staging.base import elements_from_dicts

    def _rehydrate(side: dict) -> None:
        """Swap the dict stored under `custom_element` for the element it describes."""
        serialized = side["custom_element"]
        if serialized is not None:
            # -- single-item unpacking insists that exactly one element comes back --
            (side["custom_element"],) = elements_from_dicts([serialized])

    for pair in kv_pairs:
        _rehydrate(pair["key"])
        value_side = pair["value"]
        if value_side is not None:
            _rehydrate(value_side)

    return kv_pairs
 def _kvform_pairs_to_dict(kv_pairs: list[FormKeyValuePair]) -> list[dict]:
    """
    The key_value_pairs metadata field contains (in the vast majority of cases)
    nested Text elements. Those need to be turned from Elements to dicts recursively,
    e.g. when FormKeysValues.to_dict() is used.
    """
    kv_pairs: list[dict] = copy.deepcopy(kv_pairs)
    for kv_pair in kv_pairs:
        if kv_pair["key"]["custom_element"] is not None:
            kv_pair["key"]["custom_element"] = kv_pair["key"]["custom_element"].to_dict()
        if kv_pair["value"] is not None and kv_pair["value"]["custom_element"] is not None:
            kv_pair["value"]["custom_element"] = kv_pair["value"]["custom_element"].to_dict()
    return kv_pairs
--- a/unstructured/partition/image.py
+++ b/unstructured/partition/image.py
@ -6,9 +6,7 @@ from unstructured.chunking import add_chunking_strategy
 from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import add_metadata
 from unstructured.partition.common import exactly_one
-from unstructured.partition.lang import (
+from unstructured.partition.lang import check_language_args
    check_language_args,
 )
 from unstructured.partition.pdf import partition_pdf_or_image
 from unstructured.partition.utils.constants import PartitionStrategy
@ -33,6 +31,8 @@ def partition_image(
    extract_image_block_to_payload: bool = False,
    date_from_file_object: bool = False,
    starting_page_number: int = 1,
    extract_forms: bool = False,
    form_extraction_skip_tables: bool = True,
    **kwargs: Any,
 ) -> list[Element]:
    """Parses an image into a list of interpreted elements.
@ -90,6 +90,11 @@ def partition_image(
    date_from_file_object
        Applies only when providing file via `file` parameter. If this option is True, attempt
        to infer last_modified metadata from bytes, otherwise set it to None.
    extract_forms
        Whether the form extraction logic should be run
        (results in adding FormKeysValues elements to output).
    form_extraction_skip_tables
        Whether the form extraction logic should ignore regions designated as Tables.
    """
    exactly_one(filename=filename, file=file)
@ -111,5 +116,7 @@ def partition_image(
        extract_image_block_to_payload=extract_image_block_to_payload,
        date_from_file_object=date_from_file_object,
        starting_page_number=starting_page_number,
        extract_forms=extract_forms,
        form_extraction_skip_tables=form_extraction_skip_tables,
        **kwargs,
    )
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -12,13 +12,7 @@ import numpy as np
 import pdf2image
 import wrapt
 from pdfminer import psparser
-from pdfminer.layout import (
+from pdfminer.layout import LTChar, LTContainer, LTImage, LTItem, LTTextBox
    LTChar,
    LTContainer,
    LTImage,
    LTItem,
    LTTextBox,
 )
 from pdfminer.pdftypes import PDFObjRef
 from pdfminer.utils import open_filename
 from PIL import Image as PILImage
@ -42,10 +36,7 @@ from unstructured.documents.elements import (
    Text,
    process_metadata,
 )
-from unstructured.file_utils.filetype import (
+from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
    FileType,
    add_metadata_with_filetype,
 )
 from unstructured.logger import logger, trace_logger
 from unstructured.nlp.patterns import PARAGRAPH_PATTERN
 from unstructured.partition.common import (
@ -57,10 +48,8 @@ from unstructured.partition.common import (
    ocr_data_to_elements,
    spooled_to_bytes_io_if_needed,
 )
-from unstructured.partition.lang import (
+from unstructured.partition.lang import check_language_args, prepare_languages_for_tesseract
-    check_language_args,
+from unstructured.partition.pdf_image.form_extraction import run_form_extraction
    prepare_languages_for_tesseract,
 )
 from unstructured.partition.pdf_image.pdf_image_utils import (
    annotate_layout_elements,
    check_element_types_to_extract,
@ -85,10 +74,7 @@ from unstructured.partition.utils.constants import (
    OCRMode,
    PartitionStrategy,
 )
-from unstructured.partition.utils.sorting import (
+from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements
    coord_has_valid_points,
    sort_page_elements,
 )
 from unstructured.patches.pdfminer import parse_keyword
 from unstructured.utils import requires_dependencies
@ -135,6 +121,8 @@ def partition_pdf(
    extract_image_block_to_payload: bool = False,
    date_from_file_object: bool = False,
    starting_page_number: int = 1,
    extract_forms: bool = False,
    form_extraction_skip_tables: bool = True,
    **kwargs: Any,
 ) -> list[Element]:
    """Parses a pdf document into a list of interpreted elements.
@ -191,6 +179,11 @@ def partition_pdf(
    date_from_file_object
        Applies only when providing file via `file` parameter. If this option is True, attempt
        to infer last_modified metadata from bytes, otherwise set it to None.
    extract_forms
        Whether the form extraction logic should be run
        (results in adding FormKeysValues elements to output).
    form_extraction_skip_tables
        Whether the form extraction logic should ignore regions designated as Tables.
    """
    exactly_one(filename=filename, file=file)
@ -212,6 +205,7 @@ def partition_pdf(
        extract_image_block_to_payload=extract_image_block_to_payload,
        date_from_file_object=date_from_file_object,
        starting_page_number=starting_page_number,
        extract_forms=extract_forms,
        form_extraction_skip_tables=form_extraction_skip_tables,
        **kwargs,
    )
@ -233,6 +227,8 @@ def partition_pdf_or_image(
    extract_image_block_to_payload: bool = False,
    date_from_file_object: bool = False,
    starting_page_number: int = 1,
    extract_forms: bool = False,
    form_extraction_skip_tables: bool = True,
    **kwargs: Any,
 ) -> list[Element]:
    """Parses a pdf or image document into a list of interpreted elements."""
@ -304,6 +300,8 @@ def partition_pdf_or_image(
                extract_image_block_output_dir=extract_image_block_output_dir,
                extract_image_block_to_payload=extract_image_block_to_payload,
                starting_page_number=starting_page_number,
                extract_forms=extract_forms,
                form_extraction_skip_tables=form_extraction_skip_tables,
                **kwargs,
            )
            out_elements = _process_uncategorized_text_elements(elements)
@ -390,6 +388,8 @@ def _partition_pdf_or_image_local(
    analysis: bool = False,
    analyzed_image_output_dir_path: Optional[str] = None,
    starting_page_number: int = 1,
    extract_forms: bool = False,
    form_extraction_skip_tables: bool = True,
    **kwargs: Any,
 ) -> list[Element]:
    """Partition using package installed locally"""
@ -398,10 +398,7 @@ def _partition_pdf_or_image_local(
        process_file_with_model,
    )
-    from unstructured.partition.pdf_image.ocr import (
+    from unstructured.partition.pdf_image.ocr import process_data_with_ocr, process_file_with_ocr
        process_data_with_ocr,
        process_file_with_ocr,
    )
    from unstructured.partition.pdf_image.pdfminer_processing import (
        process_data_with_pdfminer,
        process_file_with_pdfminer,
@ -581,6 +578,16 @@ def _partition_pdf_or_image_local(
            if el.text or isinstance(el, PageBreak) or hi_res_model_name.startswith("chipper"):
                out_elements.append(cast(Element, el))
    if extract_forms:
        forms = run_form_extraction(
            file=file,
            filename=filename,
            model_name=hi_res_model_name,
            elements=out_elements,
            skip_table_regions=form_extraction_skip_tables,
        )
        out_elements.extend(forms)
    return out_elements
--- a/unstructured/partition/pdf_image/form_extraction.py
+++ b/unstructured/partition/pdf_image/form_extraction.py
@ -0,0 +1,15 @@
 from __future__ import annotations
 from typing import IO
 from unstructured.documents.elements import Element, FormKeysValues
 def run_form_extraction(
    filename: str,
    file: IO[bytes],
    model_name: str,
    elements: list[Element],
    skip_table_regions: bool,
 ) -> list[FormKeysValues]:
    """Placeholder for form extraction; it currently always raises.

    The PDF/image partitioner calls this when `extract_forms` is requested and appends the
    returned elements to its output.

    Args:
        filename: Name of the document being partitioned, as passed on by the partitioner.
        file: File-like object for the document, as passed on by the partitioner.
        model_name: The partitioner passes its hi-res model name here.
        elements: The elements the partitioner has produced so far.
        skip_table_regions: The partitioner passes `form_extraction_skip_tables` here, i.e.
            whether regions designated as Tables should be ignored.

    Raises:
        NotImplementedError: Always, since no form extraction model is available yet.
    """
    message = "Form extraction not yet available."
    raise NotImplementedError(message)
--- a/unstructured/staging/weaviate.py
+++ b/unstructured/staging/weaviate.py
@ -17,6 +17,7 @@ exclude_metadata_keys = (
    "links",
    "orig_elements",
    "regex_metadata",
    "key_value_pairs",
 )
`@ -1 +1 @@`
	`__version__ = "0.13.8-dev12" # pragma: no cover`	`__version__ = "0.13.8-dev13" # pragma: no cover`