Mirror of https://github.com/Unstructured-IO/unstructured.git
chore: improve kwarg handling (#1810)
Closes `unstructured-inference` issue [#265](https://github.com/Unstructured-IO/unstructured-inference/issues/265). Cleans up the kwarg handling, replacing places where kwargs were read out of a dict (`kwargs.get(...)`) with explicit parameters in the function signatures.

#### Testing:

Should just pass CI.
This commit is contained in:
parent 82c8adba3f
commit 7fdddfbc1e
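The pattern repeated throughout the diff below: options that used to ride along in `**kwargs` and get re-extracted with `kwargs.get(...)` become explicit keyword parameters. A minimal before/after sketch of that shape, with illustrative names that are not taken from the diff:

```python
from typing import List

SORT_MODE_XY_CUT = "xy-cut"  # mirrors the constant used in the diff

# Before: the option is invisible in the signature and re-extracted as a dict entry.
def process_before(pages: List[str], **kwargs) -> List[str]:
    sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)
    return [f"{page}:{sort_mode}" for page in pages]

# After: the option is a real parameter with a default, so it is discoverable
# from the signature, documented, and type-checkable.
def process_after(pages: List[str], sort_mode: str = SORT_MODE_XY_CUT, **kwargs) -> List[str]:
    return [f"{page}:{sort_mode}" for page in pages]

assert process_before(["p1"]) == process_after(["p1"])
assert process_after(["p1"], sort_mode="basic") == ["p1:basic"]
```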
@@ -1,3 +1,11 @@
+## 0.10.26-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+
 ## 0.10.25

 ### Enhancements
@@ -76,9 +76,9 @@ jsonpatch==1.33
     # via langchain
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.318
+langchain==0.0.320
     # via -r requirements/embed-huggingface.in
-langsmith==0.0.46
+langsmith==0.0.49
     # via langchain
 markupsafe==2.1.3
     # via jinja2
@@ -6,7 +6,7 @@ pdf2image
 pdfminer.six
 # Do not move to contsraints.in, otherwise unstructured-inference will not be upgraded
 # when unstructured library is.
-unstructured-inference==0.7.9
+unstructured-inference==0.7.10
 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats
 # from one tesseract call
 unstructured.pytesseract>=0.3.12
@@ -203,7 +203,7 @@ sympy==1.12
     # via
     #   onnxruntime
     #   torch
-timm==0.9.7
+timm==0.9.8
     # via effdet
 tokenizers==0.14.1
     # via transformers
@@ -236,7 +236,7 @@ typing-extensions==4.8.0
     #   torch
 tzdata==2023.3
     # via pandas
-unstructured-inference==0.7.9
+unstructured-inference==0.7.10
     # via -r requirements/extra-pdf-image.in
 unstructured-pytesseract==0.3.12
     # via
@@ -6,7 +6,7 @@
 #
 azure-common==1.1.28
     # via azure-search-documents
-azure-core==1.29.4
+azure-core==1.29.5
     # via
     #   azure-search-documents
     #   msrest
@@ -14,7 +14,7 @@ async-timeout==4.0.3
     # via aiohttp
 attrs==23.1.0
     # via aiohttp
-azure-core==1.29.4
+azure-core==1.29.5
     # via
     #   adlfs
     #   azure-identity
@@ -61,9 +61,9 @@ jsonpatch==1.33
     # via langchain
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.318
+langchain==0.0.320
     # via -r requirements/ingest-bedrock.in
-langsmith==0.0.46
+langsmith==0.0.49
     # via langchain
 marshmallow==3.20.1
     # via
@@ -4,7 +4,7 @@
 #
 # pip-compile --constraint=requirements/constraints.in requirements/ingest-confluence.in
 #
-atlassian-python-api==3.41.2
+atlassian-python-api==3.41.3
     # via -r requirements/ingest-confluence.in
 certifi==2023.7.22
     # via
@@ -4,7 +4,7 @@
 #
 # pip-compile --constraint=requirements/constraints.in requirements/ingest-jira.in
 #
-atlassian-python-api==3.41.2
+atlassian-python-api==3.41.3
     # via -r requirements/ingest-jira.in
 certifi==2023.7.22
     # via
@@ -50,9 +50,9 @@ jsonpatch==1.33
     # via langchain
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.318
+langchain==0.0.320
     # via -r requirements/ingest-openai.in
-langsmith==0.0.46
+langsmith==0.0.49
     # via langchain
 marshmallow==3.20.1
     # via
@@ -103,7 +103,7 @@ requests==2.31.0
     # via
     #   -c requirements/base.txt
     #   label-studio-sdk
-ruff==0.1.0
+ruff==0.1.1
     # via -r requirements/test.in
 six==1.16.0
     # via
@@ -180,8 +180,11 @@ def test_partition_pdf_with_model_name_env_var(
     mock_process.assert_called_once_with(
         filename,
         is_image=False,
-        pdf_image_dpi=200,
+        pdf_image_dpi=mock.ANY,
+        extract_tables=mock.ANY,
         model_name="checkbox",
+        extract_images_in_pdf=mock.ANY,
+        image_output_dir_path=mock.ANY,
     )
@@ -199,8 +202,11 @@ def test_partition_pdf_with_model_name(
     mock_process.assert_called_once_with(
         filename,
         is_image=False,
-        pdf_image_dpi=200,
+        pdf_image_dpi=mock.ANY,
+        extract_tables=mock.ANY,
         model_name="checkbox",
+        extract_images_in_pdf=mock.ANY,
+        image_output_dir_path=mock.ANY,
     )
@@ -440,8 +446,11 @@ def test_partition_pdf_with_dpi():
     mock_process.assert_called_once_with(
         filename,
         is_image=False,
+        extract_tables=mock.ANY,
         model_name=pdf.default_hi_res_model(),
         pdf_image_dpi=100,
+        extract_images_in_pdf=mock.ANY,
+        image_output_dir_path=mock.ANY,
     )
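These assertions pin only the arguments the test actually cares about and match the rest with `unittest.mock.ANY`, a standard-library sentinel that compares equal to any value. A self-contained sketch of how that works:

```python
from unittest import mock

# mock.ANY compares equal to anything, including None.
assert mock.ANY == 200
assert mock.ANY == None  # noqa: E711 — deliberate: ANY matches None too

m = mock.Mock()
m("doc.pdf", is_image=False, pdf_image_dpi=100, extract_tables=True)

# Pin pdf_image_dpi exactly; accept whatever extract_tables happened to be.
m.assert_called_once_with(
    "doc.pdf", is_image=False, pdf_image_dpi=100, extract_tables=mock.ANY
)
```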
@@ -3,7 +3,7 @@ import os
 import pathlib
 import warnings
 from importlib import import_module
-from unittest.mock import patch
+from unittest.mock import ANY, patch

 import docx
 import pytest
@@ -347,6 +347,8 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
         url=None,
         include_page_breaks=False,
         infer_table_structure=False,
+        extract_images_in_pdf=ANY,
+        image_output_dir_path=ANY,
         strategy="fast",
         languages=None,
     )
@@ -1 +1 @@
-__version__ = "0.10.25" # pragma: no cover
+__version__ = "0.10.26-dev0" # pragma: no cover
@@ -135,6 +135,8 @@ def partition(
     languages: Optional[List[str]] = None,
     detect_language_per_element: bool = False,
     pdf_infer_table_structure: bool = False,
+    pdf_extract_images: bool = False,
+    pdf_image_output_dir_path: Optional[str] = None,
     xml_keep_tags: bool = False,
     data_source_metadata: Optional[DataSourceMetadata] = None,
     metadata_filename: Optional[str] = None,
@@ -186,6 +188,12 @@ def partition(
             additional metadata field, "text_as_html," where the value (string) is a just a
             transformation of the data into an HTML <table>.
             The "text" field for a partitioned Table Element is always present, whether True or False.
+        pdf_extract_images
+            If True and strategy=hi_res, any detected images will be saved in the path specified by
+            pdf_image_output_dir_path.
+        pdf_image_output_dir_path
+            If pdf_extract_images=True and strategy=hi_res, any detected images will be saved in the
+            given path
         xml_keep_tags
             If True, will retain the XML tags in the output. Otherwise it will simply extract
             the text from within the tags. Only applies to partition_xml.
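A hedged usage sketch for the two new `partition()` parameters documented above (the input path and output directory are illustrative, not from the diff):

```python
from unstructured.partition.auto import partition

# Per the docstring, pdf_extract_images only has an effect with strategy="hi_res".
elements = partition(
    filename="example.pdf",                     # illustrative input path
    strategy="hi_res",
    pdf_extract_images=True,
    pdf_image_output_dir_path="./pdf-images",   # illustrative output directory
)
```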
@@ -367,6 +375,8 @@ def partition(
             infer_table_structure=infer_table_structure,
             strategy=strategy,
             languages=languages,
+            extract_images_in_pdf=pdf_extract_images,
+            image_output_dir_path=pdf_image_output_dir_path,
             **kwargs,
         )
     elif (filetype == FileType.PNG) or (filetype == FileType.JPG) or (filetype == FileType.TIFF):
@@ -35,10 +35,7 @@ from unstructured.documents.elements import (
 )
 from unstructured.logger import logger
 from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
-from unstructured.partition.utils.constants import (
-    SORT_MODE_DONT,
-    SORT_MODE_XY_CUT,
-)
+from unstructured.partition.utils.constants import SORT_MODE_DONT, SORT_MODE_XY_CUT
 from unstructured.utils import dependency_exists, first

 if dependency_exists("docx") and dependency_exists("docx.table"):
@@ -551,11 +548,11 @@ def document_to_element_list(
     infer_list_items: bool = True,
     source_format: Optional[str] = None,
     detection_origin: Optional[str] = None,
+    sort_mode: str = SORT_MODE_XY_CUT,
     **kwargs,
 ) -> List[Element]:
     """Converts a DocumentLayout object to a list of unstructured elements."""
     elements: List[Element] = []
-    sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)

     num_pages = len(document.pages)
     for i, page in enumerate(document.pages):
@@ -100,6 +100,8 @@ def partition_pdf(
     metadata_last_modified: Optional[str] = None,
     chunking_strategy: Optional[str] = None,
     links: Sequence[Link] = [],
+    extract_images_in_pdf: bool = False,
+    image_output_dir_path: Optional[str] = None,
     **kwargs,
 ) -> List[Element]:
     """Parses a pdf document into a list of interpreted elements.
@@ -135,6 +137,12 @@ def partition_pdf(
         processing text/plain content.
     metadata_last_modified
         The last modified date for the document.
+    extract_images_in_pdf
+        If True and strategy=hi_res, any detected images will be saved in the path specified by
+        image_output_dir_path.
+    image_output_dir_path
+        If extract_images_in_pdf=True and strategy=hi_res, any detected images will be saved in the
+        given path
     """
     exactly_one(filename=filename, file=file)
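Note the naming split: `partition()` exposes these options as `pdf_extract_images` / `pdf_image_output_dir_path` and forwards them to `partition_pdf()`, which takes `extract_images_in_pdf` / `image_output_dir_path` (visible in the `partition()` call-site hunk above). Calling `partition_pdf()` directly looks like this sketch (paths illustrative):

```python
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example.pdf",                # illustrative input path
    strategy="hi_res",
    extract_images_in_pdf=True,
    image_output_dir_path="./pdf-images",  # illustrative output directory
)
```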
@@ -164,6 +172,8 @@ def partition_pdf(
         max_partition=max_partition,
         min_partition=min_partition,
         metadata_last_modified=metadata_last_modified,
+        extract_images_in_pdf=extract_images_in_pdf,
+        image_output_dir_path=image_output_dir_path,
         **kwargs,
     )
@@ -210,6 +220,8 @@ def partition_pdf_or_image(
     max_partition: Optional[int] = 1500,
     min_partition: Optional[int] = 0,
     metadata_last_modified: Optional[str] = None,
+    extract_images_in_pdf: bool = False,
+    image_output_dir_path: Optional[str] = None,
     **kwargs,
 ) -> List[Element]:
     """Parses a pdf or image document into a list of interpreted elements."""
@@ -292,6 +304,8 @@ def partition_pdf_or_image(
             include_page_breaks=include_page_breaks,
             languages=languages,
             metadata_last_modified=metadata_last_modified or last_modification_date,
+            extract_images_in_pdf=extract_images_in_pdf,
+            image_output_dir_path=image_output_dir_path,
             **kwargs,
         )
     layout_elements = []
@@ -334,6 +348,9 @@ def _partition_pdf_or_image_local(
     ocr_mode: str = OCRMode.FULL_PAGE.value,
     model_name: Optional[str] = None,
     metadata_last_modified: Optional[str] = None,
+    extract_images_in_pdf: bool = False,
+    image_output_dir_path: Optional[str] = None,
+    pdf_image_dpi: Optional[int] = None,
     **kwargs,
 ) -> List[Element]:
     """Partition using package installed locally."""
@@ -350,7 +367,6 @@ def _partition_pdf_or_image_local(
     ocr_languages = prepare_languages_for_tesseract(languages)

     model_name = model_name or default_hi_res_model()
-    pdf_image_dpi = kwargs.pop("pdf_image_dpi", None)
     if pdf_image_dpi is None:
         pdf_image_dpi = 300 if model_name == "chipper" else 200
     if (pdf_image_dpi < 300) and (model_name == "chipper"):
@@ -359,27 +375,16 @@ def _partition_pdf_or_image_local(
             f"(currently {pdf_image_dpi}).",
         )

-    # NOTE(christine): Need to extract images from PDF's
-    extract_images_in_pdf = kwargs.get("extract_images_in_pdf", False)
-    image_output_dir_path = kwargs.get("image_output_dir_path", None)
-    process_with_model_extra_kwargs = {
-        "extract_images_in_pdf": extract_images_in_pdf,
-        "image_output_dir_path": image_output_dir_path,
-    }
-
-    process_with_model_kwargs = {}
-    for key, value in process_with_model_extra_kwargs.items():
-        if value:
-            process_with_model_kwargs[key] = value
-
     if file is None:
         # NOTE(christine): out_layout = extracted_layout + inferred_layout
         out_layout = process_file_with_model(
             filename,
             is_image=is_image,
             extract_tables=infer_table_structure,
             model_name=model_name,
             pdf_image_dpi=pdf_image_dpi,
-            **process_with_model_kwargs,
+            extract_images_in_pdf=extract_images_in_pdf,
+            image_output_dir_path=image_output_dir_path,
         )
         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
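The deleted block built `process_with_model_kwargs` by keeping only truthy values, so falsy settings were never forwarded; because the callee's defaults already match those falsy values, passing the keywords straight through is equivalent and much simpler. A small sketch of the two styles (function names are illustrative, not from the diff):

```python
def forward_old_style(**kwargs):
    extra = {
        "extract_images_in_pdf": kwargs.get("extract_images_in_pdf", False),
        "image_output_dir_path": kwargs.get("image_output_dir_path", None),
    }
    # Truthy filter: False/None entries silently disappear before forwarding.
    return {k: v for k, v in extra.items() if v}

def forward_new_style(extract_images_in_pdf=False, image_output_dir_path=None):
    # Every value is forwarded explicitly; the defaults make filtering unnecessary.
    return {
        "extract_images_in_pdf": extract_images_in_pdf,
        "image_output_dir_path": image_output_dir_path,
    }

assert forward_old_style() == {}
assert forward_new_style() == {
    "extract_images_in_pdf": False,
    "image_output_dir_path": None,
}
```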
@@ -398,9 +403,11 @@ def _partition_pdf_or_image_local(
         out_layout = process_data_with_model(
             file,
             is_image=is_image,
             extract_tables=infer_table_structure,
             model_name=model_name,
             pdf_image_dpi=pdf_image_dpi,
-            **process_with_model_kwargs,
+            extract_images_in_pdf=extract_images_in_pdf,
+            image_output_dir_path=image_output_dir_path,
         )
         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
@@ -528,11 +535,11 @@ def _process_pdfminer_pages(
     filename: str = "",
     include_page_breaks: bool = False,
     metadata_last_modified: Optional[str] = None,
+    sort_mode: str = SORT_MODE_XY_CUT,
     **kwargs,
 ):
     """Uses PDF miner to split a document into pages and process them."""
     elements: List[Element] = []
-    sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)

     rsrcmgr = PDFResourceManager()
     laparams = LAParams()
@@ -11,6 +11,7 @@ SORT_MODE_XY_CUT = "xy-cut"
 SORT_MODE_BASIC = "basic"
 SORT_MODE_DONT = "dont"

+
 SUBREGION_THRESHOLD_FOR_OCR = 0.5
 UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)
@@ -5,10 +5,7 @@ import numpy as np

 from unstructured.documents.elements import CoordinatesMetadata, Element
 from unstructured.logger import trace_logger
-from unstructured.partition.utils.constants import (
-    SORT_MODE_BASIC,
-    SORT_MODE_XY_CUT,
-)
+from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
 from unstructured.partition.utils.xycut import recursive_xy_cut, recursive_xy_cut_swapped