mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-01 21:04:06 +00:00
Feat: form parsing placeholders (#3034)
Allows introduction of form extraction in the future - sets up the FormKeysValues element & format, puts in an empty function call in the partition_pdf_or_image pipeline.
This commit is contained in:
parent
1fb0fe5cf5
commit
e6ada05c55
@ -1,4 +1,4 @@
|
|||||||
## 0.13.8-dev12
|
## 0.13.8-dev13
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -7,6 +7,7 @@
|
|||||||
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.
|
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
* **Add form extraction basics (document elements and placeholder code in partition)**. This lays the groundwork for future form extraction. Form extraction models are not currently available in the library. An attempt to use this functionality will raise a `NotImplementedError`.
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
149
example-docs/test_evaluate_files/unstructured_output/form.json
Normal file
149
example-docs/test_evaluate_files/unstructured_output/form.json
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"type": "FormKeysValues",
|
||||||
|
"element_id": "MOCK_FORM_ID",
|
||||||
|
"text": "",
|
||||||
|
"metadata": {
|
||||||
|
"coordinates": {
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
35.15625,
|
||||||
|
95.556640625
|
||||||
|
],
|
||||||
|
[
|
||||||
|
710.357666015625,
|
||||||
|
95.556640625
|
||||||
|
],
|
||||||
|
[
|
||||||
|
710.357666015625,
|
||||||
|
887.890625
|
||||||
|
],
|
||||||
|
[
|
||||||
|
35.15625,
|
||||||
|
887.890625
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"system": "PixelSpace",
|
||||||
|
"layout_width": 754,
|
||||||
|
"layout_height": 1000
|
||||||
|
},
|
||||||
|
"page_number": 1,
|
||||||
|
"key_value_pairs": [
|
||||||
|
{
|
||||||
|
"key": {
|
||||||
|
"text": "MOCK KEY",
|
||||||
|
"custom_element": {
|
||||||
|
"type": "UncategorizedText",
|
||||||
|
"element_id": "MOCK_KEY_ID_1",
|
||||||
|
"text": "MOCK KEY",
|
||||||
|
"metadata": {
|
||||||
|
"coordinates": {
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
503.271484375,
|
||||||
|
96.3897705078125
|
||||||
|
],
|
||||||
|
[
|
||||||
|
503.271484375,
|
||||||
|
107.5164794921875
|
||||||
|
],
|
||||||
|
[
|
||||||
|
606.103515625,
|
||||||
|
107.5164794921875
|
||||||
|
],
|
||||||
|
[
|
||||||
|
606.103515625,
|
||||||
|
96.3897705078125
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"system": "PixelSpace",
|
||||||
|
"layout_width": 754,
|
||||||
|
"layout_height": 1000
|
||||||
|
},
|
||||||
|
"page_number": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"layout_element_id": null
|
||||||
|
},
|
||||||
|
"value": {
|
||||||
|
"text": "MOCK VALUE",
|
||||||
|
"custom_element": {
|
||||||
|
"type": "UncategorizedText",
|
||||||
|
"element_id": "MOCK_VALUE_ID",
|
||||||
|
"text": "MOCK VALUE",
|
||||||
|
"metadata": {
|
||||||
|
"coordinates": {
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
557.568359375,
|
||||||
|
124.8626708984375
|
||||||
|
],
|
||||||
|
[
|
||||||
|
557.568359375,
|
||||||
|
136.6607666015625
|
||||||
|
],
|
||||||
|
[
|
||||||
|
595.556640625,
|
||||||
|
136.6607666015625
|
||||||
|
],
|
||||||
|
[
|
||||||
|
595.556640625,
|
||||||
|
124.8626708984375
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"system": "PixelSpace",
|
||||||
|
"layout_width": 754,
|
||||||
|
"layout_height": 1000
|
||||||
|
},
|
||||||
|
"page_number": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"layout_element_id": null
|
||||||
|
},
|
||||||
|
"confidence": 0.0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"key": {
|
||||||
|
"text": "MOCK KEY 2",
|
||||||
|
"custom_element": {
|
||||||
|
"type": "UncategorizedText",
|
||||||
|
"element_id": "MOCK_KEY_ID_2",
|
||||||
|
"text": "MOCK KEY 2",
|
||||||
|
"metadata": {
|
||||||
|
"coordinates": {
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
428.52783203125,
|
||||||
|
124.0478515625
|
||||||
|
],
|
||||||
|
[
|
||||||
|
428.52783203125,
|
||||||
|
136.6943359375
|
||||||
|
],
|
||||||
|
[
|
||||||
|
473.81591796875,
|
||||||
|
136.6943359375
|
||||||
|
],
|
||||||
|
[
|
||||||
|
473.81591796875,
|
||||||
|
124.0478515625
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"system": "PixelSpace",
|
||||||
|
"layout_width": 754,
|
||||||
|
"layout_height": 1000
|
||||||
|
},
|
||||||
|
"page_number": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"layout_element_id": null
|
||||||
|
},
|
||||||
|
"value": null,
|
||||||
|
"confidence": 0.0
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"file_directory": "dataset/testing_data/images",
|
||||||
|
"filename": "MOCK.png"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
@ -5,13 +5,14 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import copy
|
import copy
|
||||||
|
import io
|
||||||
import json
|
import json
|
||||||
import pathlib
|
import pathlib
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from test_unstructured.unit_utils import assign_hash_ids
|
from test_unstructured.unit_utils import assign_hash_ids, example_doc_path
|
||||||
from unstructured.cleaners.core import clean_bullets, clean_prefix
|
from unstructured.cleaners.core import clean_bullets, clean_prefix
|
||||||
from unstructured.documents.coordinates import (
|
from unstructured.documents.coordinates import (
|
||||||
CoordinateSystem,
|
CoordinateSystem,
|
||||||
@ -31,6 +32,7 @@ from unstructured.documents.elements import (
|
|||||||
Title,
|
Title,
|
||||||
assign_and_map_hash_ids,
|
assign_and_map_hash_ids,
|
||||||
)
|
)
|
||||||
|
from unstructured.partition.json import partition_json
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("element", [Element(), Text(text=""), CheckBox()])
|
@pytest.mark.parametrize("element", [Element(), Text(text=""), CheckBox()])
|
||||||
@ -744,3 +746,13 @@ def test_id_to_hash_calculates(text, sequence_number, filename, page_number, exp
|
|||||||
)
|
)
|
||||||
assert element.id_to_hash(sequence_number) == expected_hash, "Returned ID does not match"
|
assert element.id_to_hash(sequence_number) == expected_hash, "Returned ID does not match"
|
||||||
assert element.id == expected_hash, "ID should be set"
|
assert element.id == expected_hash, "ID should be set"
|
||||||
|
|
||||||
|
|
||||||
|
def test_formskeysvalues_reads_saves():
    """Elements read from the form fixture survive a to_dict -> JSON -> partition_json round-trip."""
    fixture_path = example_doc_path("test_evaluate_files/unstructured_output/form.json")
    first_pass = partition_json(filename=fixture_path)

    # -- serialize what was read and feed it back in through an in-memory text file --
    element_dicts = [element.to_dict() for element in first_pass]
    round_trip_file = io.StringIO(json.dumps(element_dicts))
    second_pass = partition_json(file=round_trip_file)

    assert first_pass == second_pass
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.13.8-dev12" # pragma: no cover
|
__version__ = "0.13.8-dev13" # pragma: no cover
|
||||||
|
@ -143,6 +143,18 @@ class Link(TypedDict):
|
|||||||
start_index: int
|
start_index: int
|
||||||
|
|
||||||
|
|
||||||
|
class FormKeyOrValue(TypedDict):
    """One side (key or value) of a form key-value pair."""

    # -- text of this key or value --
    text: str
    # -- optional element id; presumably the id of a layout element this entry maps to (confirm) --
    layout_element_id: Optional[str]
    # -- optional nested Text element carried along with this key or value --
    custom_element: Optional[Text]
|
||||||
|
|
||||||
|
|
||||||
|
class FormKeyValuePair(TypedDict):
    """A form key, its optional value, and a confidence score for the pairing."""

    # -- the key side of the pair; always present --
    key: FormKeyOrValue
    # -- the value side of the pair; None when the key has no value --
    value: Optional[FormKeyOrValue]
    # -- confidence score attached to this pair --
    confidence: float
|
||||||
|
|
||||||
|
|
||||||
class ElementMetadata:
|
class ElementMetadata:
|
||||||
"""Fully-dynamic replacement for dataclass-based ElementMetadata."""
|
"""Fully-dynamic replacement for dataclass-based ElementMetadata."""
|
||||||
|
|
||||||
@ -176,6 +188,7 @@ class ElementMetadata:
|
|||||||
header_footer_type: Optional[str]
|
header_footer_type: Optional[str]
|
||||||
# -- used in chunks only, when chunk must be split mid-text to fit window --
|
# -- used in chunks only, when chunk must be split mid-text to fit window --
|
||||||
is_continuation: Optional[bool]
|
is_continuation: Optional[bool]
|
||||||
|
key_value_pairs: Optional[list[FormKeyValuePair]]
|
||||||
languages: Optional[list[str]]
|
languages: Optional[list[str]]
|
||||||
last_modified: Optional[str]
|
last_modified: Optional[str]
|
||||||
link_texts: Optional[list[str]]
|
link_texts: Optional[list[str]]
|
||||||
@ -327,6 +340,8 @@ class ElementMetadata:
|
|||||||
self.data_source = DataSourceMetadata.from_dict(field_value)
|
self.data_source = DataSourceMetadata.from_dict(field_value)
|
||||||
elif field_name == "orig_elements":
|
elif field_name == "orig_elements":
|
||||||
self.orig_elements = elements_from_base64_gzipped_json(field_value)
|
self.orig_elements = elements_from_base64_gzipped_json(field_value)
|
||||||
|
elif field_name == "key_value_pairs":
|
||||||
|
self.key_value_pairs = _kvform_rehydrate_internal_elements(field_value)
|
||||||
else:
|
else:
|
||||||
setattr(self, field_name, field_value)
|
setattr(self, field_name, field_value)
|
||||||
|
|
||||||
@ -392,6 +407,8 @@ class ElementMetadata:
|
|||||||
meta_dict["data_source"] = self.data_source.to_dict()
|
meta_dict["data_source"] = self.data_source.to_dict()
|
||||||
if self.orig_elements is not None:
|
if self.orig_elements is not None:
|
||||||
meta_dict["orig_elements"] = elements_to_base64_gzipped_json(self.orig_elements)
|
meta_dict["orig_elements"] = elements_to_base64_gzipped_json(self.orig_elements)
|
||||||
|
if self.key_value_pairs is not None:
|
||||||
|
meta_dict["key_value_pairs"] = _kvform_pairs_to_dict(self.key_value_pairs)
|
||||||
|
|
||||||
return meta_dict
|
return meta_dict
|
||||||
|
|
||||||
@ -494,6 +511,7 @@ class ConsolidationStrategy(enum.Enum):
|
|||||||
"text_as_html": cls.FIRST, # -- only occurs in Table --
|
"text_as_html": cls.FIRST, # -- only occurs in Table --
|
||||||
"table_as_cells": cls.FIRST, # -- only occurs in Table --
|
"table_as_cells": cls.FIRST, # -- only occurs in Table --
|
||||||
"url": cls.FIRST,
|
"url": cls.FIRST,
|
||||||
|
"key_value_pairs": cls.DROP, # -- only occurs in FormKeysValues --
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -660,6 +678,7 @@ class ElementType:
|
|||||||
PAGE_FOOTER = "Page-footer"
|
PAGE_FOOTER = "Page-footer"
|
||||||
PAGE_NUMBER = "PageNumber"
|
PAGE_NUMBER = "PageNumber"
|
||||||
CODE_SNIPPET = "CodeSnippet"
|
CODE_SNIPPET = "CodeSnippet"
|
||||||
|
FORM_KEYS_VALUES = "FormKeysValues"
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def to_dict(cls):
|
def to_dict(cls):
|
||||||
@ -992,6 +1011,12 @@ class PageNumber(Text):
|
|||||||
category = "PageNumber"
|
category = "PageNumber"
|
||||||
|
|
||||||
|
|
||||||
|
class FormKeysValues(Text):
    """Text element representing a form as a collection of key-value pairs.

    The element category (and serialized type name) is "FormKeysValues".
    """

    category = "FormKeysValues"
|
||||||
|
|
||||||
|
|
||||||
TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = {
|
TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = {
|
||||||
ElementType.TITLE: Title,
|
ElementType.TITLE: Title,
|
||||||
ElementType.SECTION_HEADER: Title,
|
ElementType.SECTION_HEADER: Title,
|
||||||
@ -1029,4 +1054,43 @@ TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = {
|
|||||||
ElementType.PAGE_BREAK: PageBreak,
|
ElementType.PAGE_BREAK: PageBreak,
|
||||||
ElementType.CODE_SNIPPET: CodeSnippet,
|
ElementType.CODE_SNIPPET: CodeSnippet,
|
||||||
ElementType.PAGE_NUMBER: PageNumber,
|
ElementType.PAGE_NUMBER: PageNumber,
|
||||||
|
ElementType.FORM_KEYS_VALUES: FormKeysValues,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyValuePair]:
    """Convert the serialized elements nested in `kv_pairs` back into element objects.

    Each key (and each non-None value) of a key-value pair may carry a `custom_element` in dict
    form, e.g. after the pairs were loaded by `partition_json`. Every such dict is replaced by
    the element that `elements_from_dicts()` builds from it.

    `kv_pairs` is modified in place and also returned. Per the original author's note, callers
    are expected to have deep-copied the data already, so overwriting is safe.
    """
    # -- imported at call time rather than module level (presumably to avoid a circular import) --
    from unstructured.staging.base import elements_from_dicts

    def _rehydrate(form_field: dict) -> None:
        """Swap the dict stored under "custom_element" for its element, when there is one."""
        element_dict = form_field["custom_element"]
        if element_dict is None:
            return
        # -- single-item unpacking: exactly one element is expected back for one dict --
        (form_field["custom_element"],) = elements_from_dicts([element_dict])

    for pair in kv_pairs:
        _rehydrate(pair["key"])
        value_field = pair["value"]
        if value_field is not None:
            _rehydrate(value_field)

    return kv_pairs
|
||||||
|
|
||||||
|
|
||||||
|
def _kvform_pairs_to_dict(kv_pairs: list[FormKeyValuePair]) -> list[dict]:
|
||||||
|
"""
|
||||||
|
The key_value_pairs metadata field contains (in the vast majority of cases)
|
||||||
|
nested Text elements. Those need to be turned from Elements to dicts recursively,
|
||||||
|
e.g. when FormKeysValues.to_dict() is used.
|
||||||
|
|
||||||
|
"""
|
||||||
|
kv_pairs: list[dict] = copy.deepcopy(kv_pairs)
|
||||||
|
for kv_pair in kv_pairs:
|
||||||
|
if kv_pair["key"]["custom_element"] is not None:
|
||||||
|
kv_pair["key"]["custom_element"] = kv_pair["key"]["custom_element"].to_dict()
|
||||||
|
if kv_pair["value"] is not None and kv_pair["value"]["custom_element"] is not None:
|
||||||
|
kv_pair["value"]["custom_element"] = kv_pair["value"]["custom_element"].to_dict()
|
||||||
|
|
||||||
|
return kv_pairs
|
||||||
|
@ -6,9 +6,7 @@ from unstructured.chunking import add_chunking_strategy
|
|||||||
from unstructured.documents.elements import Element, process_metadata
|
from unstructured.documents.elements import Element, process_metadata
|
||||||
from unstructured.file_utils.filetype import add_metadata
|
from unstructured.file_utils.filetype import add_metadata
|
||||||
from unstructured.partition.common import exactly_one
|
from unstructured.partition.common import exactly_one
|
||||||
from unstructured.partition.lang import (
|
from unstructured.partition.lang import check_language_args
|
||||||
check_language_args,
|
|
||||||
)
|
|
||||||
from unstructured.partition.pdf import partition_pdf_or_image
|
from unstructured.partition.pdf import partition_pdf_or_image
|
||||||
from unstructured.partition.utils.constants import PartitionStrategy
|
from unstructured.partition.utils.constants import PartitionStrategy
|
||||||
|
|
||||||
@ -33,6 +31,8 @@ def partition_image(
|
|||||||
extract_image_block_to_payload: bool = False,
|
extract_image_block_to_payload: bool = False,
|
||||||
date_from_file_object: bool = False,
|
date_from_file_object: bool = False,
|
||||||
starting_page_number: int = 1,
|
starting_page_number: int = 1,
|
||||||
|
extract_forms: bool = False,
|
||||||
|
form_extraction_skip_tables: bool = True,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> list[Element]:
|
) -> list[Element]:
|
||||||
"""Parses an image into a list of interpreted elements.
|
"""Parses an image into a list of interpreted elements.
|
||||||
@ -90,6 +90,11 @@ def partition_image(
|
|||||||
date_from_file_object
|
date_from_file_object
|
||||||
Applies only when providing file via `file` parameter. If this option is True, attempt
|
Applies only when providing file via `file` parameter. If this option is True, attempt
|
||||||
infer last_modified metadata from bytes, otherwise set it to None.
|
infer last_modified metadata from bytes, otherwise set it to None.
|
||||||
|
extract_forms
|
||||||
|
Whether the form extraction logic should be run
|
||||||
|
(results in adding FormKeysValues elements to output).
|
||||||
|
form_extraction_skip_tables
|
||||||
|
Whether the form extraction logic should ignore regions designated as Tables.
|
||||||
"""
|
"""
|
||||||
exactly_one(filename=filename, file=file)
|
exactly_one(filename=filename, file=file)
|
||||||
|
|
||||||
@ -111,5 +116,7 @@ def partition_image(
|
|||||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||||
date_from_file_object=date_from_file_object,
|
date_from_file_object=date_from_file_object,
|
||||||
starting_page_number=starting_page_number,
|
starting_page_number=starting_page_number,
|
||||||
|
extract_forms=extract_forms,
|
||||||
|
form_extraction_skip_tables=form_extraction_skip_tables,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
@ -12,13 +12,7 @@ import numpy as np
|
|||||||
import pdf2image
|
import pdf2image
|
||||||
import wrapt
|
import wrapt
|
||||||
from pdfminer import psparser
|
from pdfminer import psparser
|
||||||
from pdfminer.layout import (
|
from pdfminer.layout import LTChar, LTContainer, LTImage, LTItem, LTTextBox
|
||||||
LTChar,
|
|
||||||
LTContainer,
|
|
||||||
LTImage,
|
|
||||||
LTItem,
|
|
||||||
LTTextBox,
|
|
||||||
)
|
|
||||||
from pdfminer.pdftypes import PDFObjRef
|
from pdfminer.pdftypes import PDFObjRef
|
||||||
from pdfminer.utils import open_filename
|
from pdfminer.utils import open_filename
|
||||||
from PIL import Image as PILImage
|
from PIL import Image as PILImage
|
||||||
@ -42,10 +36,7 @@ from unstructured.documents.elements import (
|
|||||||
Text,
|
Text,
|
||||||
process_metadata,
|
process_metadata,
|
||||||
)
|
)
|
||||||
from unstructured.file_utils.filetype import (
|
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||||
FileType,
|
|
||||||
add_metadata_with_filetype,
|
|
||||||
)
|
|
||||||
from unstructured.logger import logger, trace_logger
|
from unstructured.logger import logger, trace_logger
|
||||||
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
|
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common import (
|
||||||
@ -57,10 +48,8 @@ from unstructured.partition.common import (
|
|||||||
ocr_data_to_elements,
|
ocr_data_to_elements,
|
||||||
spooled_to_bytes_io_if_needed,
|
spooled_to_bytes_io_if_needed,
|
||||||
)
|
)
|
||||||
from unstructured.partition.lang import (
|
from unstructured.partition.lang import check_language_args, prepare_languages_for_tesseract
|
||||||
check_language_args,
|
from unstructured.partition.pdf_image.form_extraction import run_form_extraction
|
||||||
prepare_languages_for_tesseract,
|
|
||||||
)
|
|
||||||
from unstructured.partition.pdf_image.pdf_image_utils import (
|
from unstructured.partition.pdf_image.pdf_image_utils import (
|
||||||
annotate_layout_elements,
|
annotate_layout_elements,
|
||||||
check_element_types_to_extract,
|
check_element_types_to_extract,
|
||||||
@ -85,10 +74,7 @@ from unstructured.partition.utils.constants import (
|
|||||||
OCRMode,
|
OCRMode,
|
||||||
PartitionStrategy,
|
PartitionStrategy,
|
||||||
)
|
)
|
||||||
from unstructured.partition.utils.sorting import (
|
from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements
|
||||||
coord_has_valid_points,
|
|
||||||
sort_page_elements,
|
|
||||||
)
|
|
||||||
from unstructured.patches.pdfminer import parse_keyword
|
from unstructured.patches.pdfminer import parse_keyword
|
||||||
from unstructured.utils import requires_dependencies
|
from unstructured.utils import requires_dependencies
|
||||||
|
|
||||||
@ -135,6 +121,8 @@ def partition_pdf(
|
|||||||
extract_image_block_to_payload: bool = False,
|
extract_image_block_to_payload: bool = False,
|
||||||
date_from_file_object: bool = False,
|
date_from_file_object: bool = False,
|
||||||
starting_page_number: int = 1,
|
starting_page_number: int = 1,
|
||||||
|
extract_forms: bool = False,
|
||||||
|
form_extraction_skip_tables: bool = True,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> list[Element]:
|
) -> list[Element]:
|
||||||
"""Parses a pdf document into a list of interpreted elements.
|
"""Parses a pdf document into a list of interpreted elements.
|
||||||
@ -191,6 +179,11 @@ def partition_pdf(
|
|||||||
date_from_file_object
|
date_from_file_object
|
||||||
Applies only when providing file via `file` parameter. If this option is True, attempt
|
Applies only when providing file via `file` parameter. If this option is True, attempt
|
||||||
infer last_modified metadata from bytes, otherwise set it to None.
|
infer last_modified metadata from bytes, otherwise set it to None.
|
||||||
|
extract_forms
|
||||||
|
Whether the form extraction logic should be run
|
||||||
|
(results in adding FormKeysValues elements to output).
|
||||||
|
form_extraction_skip_tables
|
||||||
|
Whether the form extraction logic should ignore regions designated as Tables.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
exactly_one(filename=filename, file=file)
|
exactly_one(filename=filename, file=file)
|
||||||
@ -212,6 +205,7 @@ def partition_pdf(
|
|||||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||||
date_from_file_object=date_from_file_object,
|
date_from_file_object=date_from_file_object,
|
||||||
starting_page_number=starting_page_number,
|
starting_page_number=starting_page_number,
|
||||||
|
extract_forms=extract_forms,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -233,6 +227,8 @@ def partition_pdf_or_image(
|
|||||||
extract_image_block_to_payload: bool = False,
|
extract_image_block_to_payload: bool = False,
|
||||||
date_from_file_object: bool = False,
|
date_from_file_object: bool = False,
|
||||||
starting_page_number: int = 1,
|
starting_page_number: int = 1,
|
||||||
|
extract_forms: bool = False,
|
||||||
|
form_extraction_skip_tables: bool = True,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> list[Element]:
|
) -> list[Element]:
|
||||||
"""Parses a pdf or image document into a list of interpreted elements."""
|
"""Parses a pdf or image document into a list of interpreted elements."""
|
||||||
@ -304,6 +300,8 @@ def partition_pdf_or_image(
|
|||||||
extract_image_block_output_dir=extract_image_block_output_dir,
|
extract_image_block_output_dir=extract_image_block_output_dir,
|
||||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||||
starting_page_number=starting_page_number,
|
starting_page_number=starting_page_number,
|
||||||
|
extract_forms=extract_forms,
|
||||||
|
form_extraction_skip_tables=form_extraction_skip_tables,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
out_elements = _process_uncategorized_text_elements(elements)
|
out_elements = _process_uncategorized_text_elements(elements)
|
||||||
@ -390,6 +388,8 @@ def _partition_pdf_or_image_local(
|
|||||||
analysis: bool = False,
|
analysis: bool = False,
|
||||||
analyzed_image_output_dir_path: Optional[str] = None,
|
analyzed_image_output_dir_path: Optional[str] = None,
|
||||||
starting_page_number: int = 1,
|
starting_page_number: int = 1,
|
||||||
|
extract_forms: bool = False,
|
||||||
|
form_extraction_skip_tables: bool = True,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> list[Element]:
|
) -> list[Element]:
|
||||||
"""Partition using package installed locally"""
|
"""Partition using package installed locally"""
|
||||||
@ -398,10 +398,7 @@ def _partition_pdf_or_image_local(
|
|||||||
process_file_with_model,
|
process_file_with_model,
|
||||||
)
|
)
|
||||||
|
|
||||||
from unstructured.partition.pdf_image.ocr import (
|
from unstructured.partition.pdf_image.ocr import process_data_with_ocr, process_file_with_ocr
|
||||||
process_data_with_ocr,
|
|
||||||
process_file_with_ocr,
|
|
||||||
)
|
|
||||||
from unstructured.partition.pdf_image.pdfminer_processing import (
|
from unstructured.partition.pdf_image.pdfminer_processing import (
|
||||||
process_data_with_pdfminer,
|
process_data_with_pdfminer,
|
||||||
process_file_with_pdfminer,
|
process_file_with_pdfminer,
|
||||||
@ -581,6 +578,16 @@ def _partition_pdf_or_image_local(
|
|||||||
if el.text or isinstance(el, PageBreak) or hi_res_model_name.startswith("chipper"):
|
if el.text or isinstance(el, PageBreak) or hi_res_model_name.startswith("chipper"):
|
||||||
out_elements.append(cast(Element, el))
|
out_elements.append(cast(Element, el))
|
||||||
|
|
||||||
|
if extract_forms:
|
||||||
|
forms = run_form_extraction(
|
||||||
|
file=file,
|
||||||
|
filename=filename,
|
||||||
|
model_name=hi_res_model_name,
|
||||||
|
elements=out_elements,
|
||||||
|
skip_table_regions=form_extraction_skip_tables,
|
||||||
|
)
|
||||||
|
out_elements.extend(forms)
|
||||||
|
|
||||||
return out_elements
|
return out_elements
|
||||||
|
|
||||||
|
|
||||||
|
15
unstructured/partition/pdf_image/form_extraction.py
Normal file
15
unstructured/partition/pdf_image/form_extraction.py
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import IO
|
||||||
|
|
||||||
|
from unstructured.documents.elements import Element, FormKeysValues
|
||||||
|
|
||||||
|
|
||||||
|
def run_form_extraction(
    filename: str,
    file: IO[bytes],
    model_name: str,
    elements: list[Element],
    skip_table_regions: bool,
) -> list[FormKeysValues]:
    """Placeholder for form extraction; no implementation is available yet.

    The signature fixes the intended interface, but none of the arguments are used.

    Raises:
        NotImplementedError: always.
    """
    message = "Form extraction not yet available."
    raise NotImplementedError(message)
|
@ -17,6 +17,7 @@ exclude_metadata_keys = (
|
|||||||
"links",
|
"links",
|
||||||
"orig_elements",
|
"orig_elements",
|
||||||
"regex_metadata",
|
"regex_metadata",
|
||||||
|
"key_value_pairs",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user