chore: improve kwarg handling (#1810)

Closes `unstructured-inference` issue
[#265](https://github.com/Unstructured-IO/unstructured-inference/issues/265).

Cleaned up the kwarg handling, taking opportunities to turn instances of
handling kwargs as dicts to just using them as normal in function
signatures.

#### Testing:

Should just pass CI.
This commit is contained in:
qued 2023-10-22 23:48:28 -05:00 committed by GitHub
parent 82c8adba3f
commit 7fdddfbc1e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 75 additions and 44 deletions

View File

@ -1,3 +1,11 @@
## 0.10.26-dev0
### Enhancements
### Features
### Fixes
## 0.10.25 ## 0.10.25
### Enhancements ### Enhancements

View File

@ -76,9 +76,9 @@ jsonpatch==1.33
# via langchain # via langchain
jsonpointer==2.4 jsonpointer==2.4
# via jsonpatch # via jsonpatch
langchain==0.0.318 langchain==0.0.320
# via -r requirements/embed-huggingface.in # via -r requirements/embed-huggingface.in
langsmith==0.0.46 langsmith==0.0.49
# via langchain # via langchain
markupsafe==2.1.3 markupsafe==2.1.3
# via jinja2 # via jinja2

View File

@ -6,7 +6,7 @@ pdf2image
pdfminer.six pdfminer.six
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
# when unstructured library is. # when unstructured library is.
unstructured-inference==0.7.9 unstructured-inference==0.7.10
# unstructured fork of pytesseract that provides an interface to allow for multiple output formats # unstructured fork of pytesseract that provides an interface to allow for multiple output formats
# from one tesseract call # from one tesseract call
unstructured.pytesseract>=0.3.12 unstructured.pytesseract>=0.3.12

View File

@ -203,7 +203,7 @@ sympy==1.12
# via # via
# onnxruntime # onnxruntime
# torch # torch
timm==0.9.7 timm==0.9.8
# via effdet # via effdet
tokenizers==0.14.1 tokenizers==0.14.1
# via transformers # via transformers
@ -236,7 +236,7 @@ typing-extensions==4.8.0
# torch # torch
tzdata==2023.3 tzdata==2023.3
# via pandas # via pandas
unstructured-inference==0.7.9 unstructured-inference==0.7.10
# via -r requirements/extra-pdf-image.in # via -r requirements/extra-pdf-image.in
unstructured-pytesseract==0.3.12 unstructured-pytesseract==0.3.12
# via # via

View File

@ -6,7 +6,7 @@
# #
azure-common==1.1.28 azure-common==1.1.28
# via azure-search-documents # via azure-search-documents
azure-core==1.29.4 azure-core==1.29.5
# via # via
# azure-search-documents # azure-search-documents
# msrest # msrest

View File

@ -14,7 +14,7 @@ async-timeout==4.0.3
# via aiohttp # via aiohttp
attrs==23.1.0 attrs==23.1.0
# via aiohttp # via aiohttp
azure-core==1.29.4 azure-core==1.29.5
# via # via
# adlfs # adlfs
# azure-identity # azure-identity

View File

@ -61,9 +61,9 @@ jsonpatch==1.33
# via langchain # via langchain
jsonpointer==2.4 jsonpointer==2.4
# via jsonpatch # via jsonpatch
langchain==0.0.318 langchain==0.0.320
# via -r requirements/ingest-bedrock.in # via -r requirements/ingest-bedrock.in
langsmith==0.0.46 langsmith==0.0.49
# via langchain # via langchain
marshmallow==3.20.1 marshmallow==3.20.1
# via # via

View File

@ -4,7 +4,7 @@
# #
# pip-compile --constraint=requirements/constraints.in requirements/ingest-confluence.in # pip-compile --constraint=requirements/constraints.in requirements/ingest-confluence.in
# #
atlassian-python-api==3.41.2 atlassian-python-api==3.41.3
# via -r requirements/ingest-confluence.in # via -r requirements/ingest-confluence.in
certifi==2023.7.22 certifi==2023.7.22
# via # via

View File

@ -4,7 +4,7 @@
# #
# pip-compile --constraint=requirements/constraints.in requirements/ingest-jira.in # pip-compile --constraint=requirements/constraints.in requirements/ingest-jira.in
# #
atlassian-python-api==3.41.2 atlassian-python-api==3.41.3
# via -r requirements/ingest-jira.in # via -r requirements/ingest-jira.in
certifi==2023.7.22 certifi==2023.7.22
# via # via

View File

@ -50,9 +50,9 @@ jsonpatch==1.33
# via langchain # via langchain
jsonpointer==2.4 jsonpointer==2.4
# via jsonpatch # via jsonpatch
langchain==0.0.318 langchain==0.0.320
# via -r requirements/ingest-openai.in # via -r requirements/ingest-openai.in
langsmith==0.0.46 langsmith==0.0.49
# via langchain # via langchain
marshmallow==3.20.1 marshmallow==3.20.1
# via # via

View File

@ -103,7 +103,7 @@ requests==2.31.0
# via # via
# -c requirements/base.txt # -c requirements/base.txt
# label-studio-sdk # label-studio-sdk
ruff==0.1.0 ruff==0.1.1
# via -r requirements/test.in # via -r requirements/test.in
six==1.16.0 six==1.16.0
# via # via

View File

@ -180,8 +180,11 @@ def test_partition_pdf_with_model_name_env_var(
mock_process.assert_called_once_with( mock_process.assert_called_once_with(
filename, filename,
is_image=False, is_image=False,
pdf_image_dpi=200, pdf_image_dpi=mock.ANY,
extract_tables=mock.ANY,
model_name="checkbox", model_name="checkbox",
extract_images_in_pdf=mock.ANY,
image_output_dir_path=mock.ANY,
) )
@ -199,8 +202,11 @@ def test_partition_pdf_with_model_name(
mock_process.assert_called_once_with( mock_process.assert_called_once_with(
filename, filename,
is_image=False, is_image=False,
pdf_image_dpi=200, pdf_image_dpi=mock.ANY,
extract_tables=mock.ANY,
model_name="checkbox", model_name="checkbox",
extract_images_in_pdf=mock.ANY,
image_output_dir_path=mock.ANY,
) )
@ -440,8 +446,11 @@ def test_partition_pdf_with_dpi():
mock_process.assert_called_once_with( mock_process.assert_called_once_with(
filename, filename,
is_image=False, is_image=False,
extract_tables=mock.ANY,
model_name=pdf.default_hi_res_model(), model_name=pdf.default_hi_res_model(),
pdf_image_dpi=100, pdf_image_dpi=100,
extract_images_in_pdf=mock.ANY,
image_output_dir_path=mock.ANY,
) )

View File

@ -3,7 +3,7 @@ import os
import pathlib import pathlib
import warnings import warnings
from importlib import import_module from importlib import import_module
from unittest.mock import patch from unittest.mock import ANY, patch
import docx import docx
import pytest import pytest
@ -347,6 +347,8 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
url=None, url=None,
include_page_breaks=False, include_page_breaks=False,
infer_table_structure=False, infer_table_structure=False,
extract_images_in_pdf=ANY,
image_output_dir_path=ANY,
strategy="fast", strategy="fast",
languages=None, languages=None,
) )

View File

@ -1 +1 @@
__version__ = "0.10.25" # pragma: no cover __version__ = "0.10.26-dev0" # pragma: no cover

View File

@ -135,6 +135,8 @@ def partition(
languages: Optional[List[str]] = None, languages: Optional[List[str]] = None,
detect_language_per_element: bool = False, detect_language_per_element: bool = False,
pdf_infer_table_structure: bool = False, pdf_infer_table_structure: bool = False,
pdf_extract_images: bool = False,
pdf_image_output_dir_path: Optional[str] = None,
xml_keep_tags: bool = False, xml_keep_tags: bool = False,
data_source_metadata: Optional[DataSourceMetadata] = None, data_source_metadata: Optional[DataSourceMetadata] = None,
metadata_filename: Optional[str] = None, metadata_filename: Optional[str] = None,
@ -186,6 +188,12 @@ def partition(
additional metadata field, "text_as_html," where the value (string) is just a additional metadata field, "text_as_html," where the value (string) is just a
transformation of the data into an HTML <table>. transformation of the data into an HTML <table>.
The "text" field for a partitioned Table Element is always present, whether True or False. The "text" field for a partitioned Table Element is always present, whether True or False.
pdf_extract_images
If True and strategy=hi_res, any detected images will be saved in the path specified by
pdf_image_output_dir_path.
pdf_image_output_dir_path
If pdf_extract_images=True and strategy=hi_res, any detected images will be saved in the
given path
xml_keep_tags xml_keep_tags
If True, will retain the XML tags in the output. Otherwise it will simply extract If True, will retain the XML tags in the output. Otherwise it will simply extract
the text from within the tags. Only applies to partition_xml. the text from within the tags. Only applies to partition_xml.
@ -367,6 +375,8 @@ def partition(
infer_table_structure=infer_table_structure, infer_table_structure=infer_table_structure,
strategy=strategy, strategy=strategy,
languages=languages, languages=languages,
extract_images_in_pdf=pdf_extract_images,
image_output_dir_path=pdf_image_output_dir_path,
**kwargs, **kwargs,
) )
elif (filetype == FileType.PNG) or (filetype == FileType.JPG) or (filetype == FileType.TIFF): elif (filetype == FileType.PNG) or (filetype == FileType.JPG) or (filetype == FileType.TIFF):

View File

@ -35,10 +35,7 @@ from unstructured.documents.elements import (
) )
from unstructured.logger import logger from unstructured.logger import logger
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
from unstructured.partition.utils.constants import ( from unstructured.partition.utils.constants import SORT_MODE_DONT, SORT_MODE_XY_CUT
SORT_MODE_DONT,
SORT_MODE_XY_CUT,
)
from unstructured.utils import dependency_exists, first from unstructured.utils import dependency_exists, first
if dependency_exists("docx") and dependency_exists("docx.table"): if dependency_exists("docx") and dependency_exists("docx.table"):
@ -551,11 +548,11 @@ def document_to_element_list(
infer_list_items: bool = True, infer_list_items: bool = True,
source_format: Optional[str] = None, source_format: Optional[str] = None,
detection_origin: Optional[str] = None, detection_origin: Optional[str] = None,
sort_mode: str = SORT_MODE_XY_CUT,
**kwargs, **kwargs,
) -> List[Element]: ) -> List[Element]:
"""Converts a DocumentLayout object to a list of unstructured elements.""" """Converts a DocumentLayout object to a list of unstructured elements."""
elements: List[Element] = [] elements: List[Element] = []
sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)
num_pages = len(document.pages) num_pages = len(document.pages)
for i, page in enumerate(document.pages): for i, page in enumerate(document.pages):

View File

@ -100,6 +100,8 @@ def partition_pdf(
metadata_last_modified: Optional[str] = None, metadata_last_modified: Optional[str] = None,
chunking_strategy: Optional[str] = None, chunking_strategy: Optional[str] = None,
links: Sequence[Link] = [], links: Sequence[Link] = [],
extract_images_in_pdf: bool = False,
image_output_dir_path: Optional[str] = None,
**kwargs, **kwargs,
) -> List[Element]: ) -> List[Element]:
"""Parses a pdf document into a list of interpreted elements. """Parses a pdf document into a list of interpreted elements.
@ -135,6 +137,12 @@ def partition_pdf(
processing text/plain content. processing text/plain content.
metadata_last_modified metadata_last_modified
The last modified date for the document. The last modified date for the document.
extract_images_in_pdf
If True and strategy=hi_res, any detected images will be saved in the path specified by
image_output_dir_path.
image_output_dir_path
If extract_images_in_pdf=True and strategy=hi_res, any detected images will be saved in the
given path
""" """
exactly_one(filename=filename, file=file) exactly_one(filename=filename, file=file)
@ -164,6 +172,8 @@ def partition_pdf(
max_partition=max_partition, max_partition=max_partition,
min_partition=min_partition, min_partition=min_partition,
metadata_last_modified=metadata_last_modified, metadata_last_modified=metadata_last_modified,
extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
**kwargs, **kwargs,
) )
@ -210,6 +220,8 @@ def partition_pdf_or_image(
max_partition: Optional[int] = 1500, max_partition: Optional[int] = 1500,
min_partition: Optional[int] = 0, min_partition: Optional[int] = 0,
metadata_last_modified: Optional[str] = None, metadata_last_modified: Optional[str] = None,
extract_images_in_pdf: bool = False,
image_output_dir_path: Optional[str] = None,
**kwargs, **kwargs,
) -> List[Element]: ) -> List[Element]:
"""Parses a pdf or image document into a list of interpreted elements.""" """Parses a pdf or image document into a list of interpreted elements."""
@ -292,6 +304,8 @@ def partition_pdf_or_image(
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
languages=languages, languages=languages,
metadata_last_modified=metadata_last_modified or last_modification_date, metadata_last_modified=metadata_last_modified or last_modification_date,
extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
**kwargs, **kwargs,
) )
layout_elements = [] layout_elements = []
@ -334,6 +348,9 @@ def _partition_pdf_or_image_local(
ocr_mode: str = OCRMode.FULL_PAGE.value, ocr_mode: str = OCRMode.FULL_PAGE.value,
model_name: Optional[str] = None, model_name: Optional[str] = None,
metadata_last_modified: Optional[str] = None, metadata_last_modified: Optional[str] = None,
extract_images_in_pdf: bool = False,
image_output_dir_path: Optional[str] = None,
pdf_image_dpi: Optional[int] = None,
**kwargs, **kwargs,
) -> List[Element]: ) -> List[Element]:
"""Partition using package installed locally.""" """Partition using package installed locally."""
@ -350,7 +367,6 @@ def _partition_pdf_or_image_local(
ocr_languages = prepare_languages_for_tesseract(languages) ocr_languages = prepare_languages_for_tesseract(languages)
model_name = model_name or default_hi_res_model() model_name = model_name or default_hi_res_model()
pdf_image_dpi = kwargs.pop("pdf_image_dpi", None)
if pdf_image_dpi is None: if pdf_image_dpi is None:
pdf_image_dpi = 300 if model_name == "chipper" else 200 pdf_image_dpi = 300 if model_name == "chipper" else 200
if (pdf_image_dpi < 300) and (model_name == "chipper"): if (pdf_image_dpi < 300) and (model_name == "chipper"):
@ -359,27 +375,16 @@ def _partition_pdf_or_image_local(
f"(currently {pdf_image_dpi}).", f"(currently {pdf_image_dpi}).",
) )
# NOTE(christine): Need to extract images from PDF's
extract_images_in_pdf = kwargs.get("extract_images_in_pdf", False)
image_output_dir_path = kwargs.get("image_output_dir_path", None)
process_with_model_extra_kwargs = {
"extract_images_in_pdf": extract_images_in_pdf,
"image_output_dir_path": image_output_dir_path,
}
process_with_model_kwargs = {}
for key, value in process_with_model_extra_kwargs.items():
if value:
process_with_model_kwargs[key] = value
if file is None: if file is None:
# NOTE(christine): out_layout = extracted_layout + inferred_layout # NOTE(christine): out_layout = extracted_layout + inferred_layout
out_layout = process_file_with_model( out_layout = process_file_with_model(
filename, filename,
is_image=is_image, is_image=is_image,
extract_tables=infer_table_structure,
model_name=model_name, model_name=model_name,
pdf_image_dpi=pdf_image_dpi, pdf_image_dpi=pdf_image_dpi,
**process_with_model_kwargs, extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
) )
if model_name.startswith("chipper"): if model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper # NOTE(alan): We shouldn't do OCR with chipper
@ -398,9 +403,11 @@ def _partition_pdf_or_image_local(
out_layout = process_data_with_model( out_layout = process_data_with_model(
file, file,
is_image=is_image, is_image=is_image,
extract_tables=infer_table_structure,
model_name=model_name, model_name=model_name,
pdf_image_dpi=pdf_image_dpi, pdf_image_dpi=pdf_image_dpi,
**process_with_model_kwargs, extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
) )
if model_name.startswith("chipper"): if model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper # NOTE(alan): We shouldn't do OCR with chipper
@ -528,11 +535,11 @@ def _process_pdfminer_pages(
filename: str = "", filename: str = "",
include_page_breaks: bool = False, include_page_breaks: bool = False,
metadata_last_modified: Optional[str] = None, metadata_last_modified: Optional[str] = None,
sort_mode: str = SORT_MODE_XY_CUT,
**kwargs, **kwargs,
): ):
"""Uses PDF miner to split a document into pages and process them.""" """Uses PDF miner to split a document into pages and process them."""
elements: List[Element] = [] elements: List[Element] = []
sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)
rsrcmgr = PDFResourceManager() rsrcmgr = PDFResourceManager()
laparams = LAParams() laparams = LAParams()

View File

@ -11,6 +11,7 @@ SORT_MODE_XY_CUT = "xy-cut"
SORT_MODE_BASIC = "basic" SORT_MODE_BASIC = "basic"
SORT_MODE_DONT = "dont" SORT_MODE_DONT = "dont"
SUBREGION_THRESHOLD_FOR_OCR = 0.5 SUBREGION_THRESHOLD_FOR_OCR = 0.5
UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False) UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)

View File

@ -5,10 +5,7 @@ import numpy as np
from unstructured.documents.elements import CoordinatesMetadata, Element from unstructured.documents.elements import CoordinatesMetadata, Element
from unstructured.logger import trace_logger from unstructured.logger import trace_logger
from unstructured.partition.utils.constants import ( from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
SORT_MODE_BASIC,
SORT_MODE_XY_CUT,
)
from unstructured.partition.utils.xycut import recursive_xy_cut, recursive_xy_cut_swapped from unstructured.partition.utils.xycut import recursive_xy_cut, recursive_xy_cut_swapped