mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-01 11:21:13 +00:00
chore: add hi_res_model_name kwarg (#2289)
Closes #2160 Explicitly adds `hi_res_model_name` as kwarg to relevant functions and notes that `model_name` is to be deprecated. Testing: ``` from unstructured.partition.auto import partition filename = "example-docs/DA-1p.pdf" elements = partition(filename, strategy="hi_res", hi_res_model_name="yolox") ``` --------- Co-authored-by: cragwolfe <crag@unstructured.io> Co-authored-by: Steve Canny <stcanny@gmail.com> Co-authored-by: Christine Straub <christinemstraub@gmail.com> Co-authored-by: Yao You <yao@unstructured.io> Co-authored-by: Yao You <theyaoyou@gmail.com>
This commit is contained in:
parent
093a11d058
commit
5c0043aa7d
@ -24,6 +24,8 @@
|
||||
### Fixes
|
||||
* **Enable --fields argument omission for elasticsearch connector** Solves two bugs where removing the optional parameter --fields broke the connector due to an integer processing error and using an elasticsearch config for a destination connector resulted in a serialization issue when optional parameter --fields was not provided.
|
||||
|
||||
* **Add hi_res_model_name** Adds kwarg to relevant functions and add comments that model_name is to be deprecated.
|
||||
|
||||
## 0.11.5
|
||||
|
||||
### Enhancements
|
||||
|
@ -536,6 +536,18 @@ def test_partition_image_uses_model_name():
|
||||
assert mockpartition.call_args.kwargs["model_name"]
|
||||
|
||||
|
||||
def test_partition_image_uses_hi_res_model_name():
|
||||
with mock.patch.object(
|
||||
pdf,
|
||||
"_partition_pdf_or_image_local",
|
||||
) as mockpartition:
|
||||
image.partition_image("example-docs/layout-parser-paper-fast.jpg", hi_res_model_name="test")
|
||||
print(mockpartition.call_args)
|
||||
assert "model_name" not in mockpartition.call_args.kwargs
|
||||
assert "hi_res_model_name" in mockpartition.call_args.kwargs
|
||||
assert mockpartition.call_args.kwargs["hi_res_model_name"] == "test"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("ocr_mode", "idx_title_element"),
|
||||
[
|
||||
|
@ -215,6 +215,40 @@ def test_partition_pdf_with_model_name(
|
||||
assert mock_process.call_args[1]["model_name"] == "checkbox"
|
||||
|
||||
|
||||
def test_partition_pdf_with_hi_res_model_name(
|
||||
monkeypatch,
|
||||
filename=example_doc_path("layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
|
||||
with mock.patch.object(
|
||||
layout,
|
||||
"process_file_with_model",
|
||||
mock.MagicMock(),
|
||||
) as mock_process:
|
||||
pdf.partition_pdf(
|
||||
filename=filename, strategy=PartitionStrategy.HI_RES, hi_res_model_name="checkbox"
|
||||
)
|
||||
# unstructured-ingest uses `model_name` instead of `hi_res_model_name`
|
||||
assert mock_process.call_args[1]["model_name"] == "checkbox"
|
||||
|
||||
|
||||
def test_partition_pdf_or_image_with_hi_res_model_name(
|
||||
monkeypatch,
|
||||
filename=example_doc_path("layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
|
||||
with mock.patch.object(
|
||||
layout,
|
||||
"process_file_with_model",
|
||||
mock.MagicMock(),
|
||||
) as mock_process:
|
||||
pdf.partition_pdf_or_image(
|
||||
filename=filename, strategy=PartitionStrategy.HI_RES, hi_res_model_name="checkbox"
|
||||
)
|
||||
# unstructured-ingest uses `model_name` instead of `hi_res_model_name`
|
||||
assert mock_process.call_args[1]["model_name"] == "checkbox"
|
||||
|
||||
|
||||
def test_partition_pdf_with_auto_strategy(
|
||||
filename=example_doc_path("layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
@ -798,6 +832,22 @@ def test_partition_pdf_uses_model_name():
|
||||
assert mockpartition.call_args.kwargs["model_name"]
|
||||
|
||||
|
||||
def test_partition_pdf_uses_hi_res_model_name():
|
||||
with mock.patch.object(
|
||||
pdf,
|
||||
"_partition_pdf_or_image_local",
|
||||
) as mockpartition:
|
||||
pdf.partition_pdf(
|
||||
example_doc_path("layout-parser-paper-fast.pdf"),
|
||||
hi_res_model_name="test",
|
||||
strategy=PartitionStrategy.HI_RES,
|
||||
)
|
||||
|
||||
mockpartition.assert_called_once()
|
||||
assert "hi_res_model_name" in mockpartition.call_args.kwargs
|
||||
assert mockpartition.call_args.kwargs["hi_res_model_name"]
|
||||
|
||||
|
||||
def test_partition_pdf_word_bbox_not_char(
|
||||
filename=example_doc_path("interface-config-guide-p93.pdf"),
|
||||
):
|
||||
@ -863,6 +913,18 @@ def test_partition_model_name_default_to_None():
|
||||
pytest.fail("partition_pdf() raised AttributeError unexpectedly!")
|
||||
|
||||
|
||||
def test_partition_hi_res_model_name_default_to_None():
|
||||
filename = example_doc_path("DA-1p.pdf")
|
||||
try:
|
||||
pdf.partition_pdf(
|
||||
filename=filename,
|
||||
strategy=PartitionStrategy.HI_RES,
|
||||
hi_res_model_name=None,
|
||||
)
|
||||
except AttributeError:
|
||||
pytest.fail("partition_pdf() raised AttributeError unexpectedly!")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("strategy", "ocr_func"),
|
||||
[
|
||||
|
@ -356,6 +356,7 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
|
||||
image_output_dir_path=ANY,
|
||||
strategy=PartitionStrategy.FAST,
|
||||
languages=None,
|
||||
hi_res_model_name=None,
|
||||
)
|
||||
|
||||
|
||||
|
@ -142,6 +142,8 @@ def partition(
|
||||
data_source_metadata: Optional[DataSourceMetadata] = None,
|
||||
metadata_filename: Optional[str] = None,
|
||||
request_timeout: Optional[int] = None,
|
||||
hi_res_model_name: Optional[str] = None,
|
||||
model_name: Optional[str] = None, # to be deprecated
|
||||
**kwargs,
|
||||
):
|
||||
"""Partitions a document into its constituent elements. Will use libmagic to determine
|
||||
@ -202,6 +204,11 @@ def partition(
|
||||
request_timeout
|
||||
The timeout for the HTTP request if URL is set. Defaults to None meaning no timeout and
|
||||
requests will block indefinitely.
|
||||
hi_res_model_name
|
||||
The layout detection model used when partitioning strategy is set to `hi_res`.
|
||||
model_name
|
||||
The layout detection model used when partitioning strategy is set to `hi_res`. To be
|
||||
deprecated in favor of `hi_res_model_name`.
|
||||
"""
|
||||
exactly_one(file=file, filename=filename, url=url)
|
||||
|
||||
@ -391,6 +398,7 @@ def partition(
|
||||
languages=languages,
|
||||
extract_images_in_pdf=pdf_extract_images,
|
||||
image_output_dir_path=pdf_image_output_dir_path,
|
||||
hi_res_model_name=hi_res_model_name or model_name,
|
||||
**kwargs,
|
||||
)
|
||||
elif (filetype == FileType.PNG) or (filetype == FileType.JPG) or (filetype == FileType.TIFF):
|
||||
@ -402,6 +410,7 @@ def partition(
|
||||
infer_table_structure=infer_table_structure,
|
||||
strategy=strategy,
|
||||
languages=languages,
|
||||
hi_res_model_name=hi_res_model_name or model_name,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.TXT:
|
||||
|
@ -25,6 +25,7 @@ def partition_image(
|
||||
strategy: str = PartitionStrategy.HI_RES,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
chunking_strategy: Optional[str] = None,
|
||||
hi_res_model_name: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Parses an image into a list of interpreted elements.
|
||||
@ -55,6 +56,8 @@ def partition_image(
|
||||
The default strategy is `hi_res`.
|
||||
metadata_last_modified
|
||||
The last modified date for the document.
|
||||
hi_res_model_name
|
||||
The layout detection model used when partitioning strategy is set to `hi_res`.
|
||||
"""
|
||||
exactly_one(filename=filename, file=file)
|
||||
|
||||
@ -89,5 +92,6 @@ def partition_image(
|
||||
languages=languages,
|
||||
strategy=strategy,
|
||||
metadata_last_modified=metadata_last_modified,
|
||||
hi_res_model_name=hi_res_model_name,
|
||||
**kwargs,
|
||||
)
|
||||
|
@ -143,6 +143,7 @@ def partition_pdf(
|
||||
extract_images_in_pdf: bool = False,
|
||||
extract_element_types: Optional[List[str]] = None,
|
||||
image_output_dir_path: Optional[str] = None,
|
||||
hi_res_model_name: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Parses a pdf document into a list of interpreted elements.
|
||||
@ -182,6 +183,8 @@ def partition_pdf(
|
||||
image_output_dir_path
|
||||
Only applicable if `strategy=hi_res`.
|
||||
The path for saving images when using `extract_images_in_pdf` or `extract_element_types`.
|
||||
hi_res_model_name
|
||||
The layout detection model used when partitioning strategy is set to `hi_res`.
|
||||
"""
|
||||
|
||||
exactly_one(filename=filename, file=file)
|
||||
@ -199,6 +202,7 @@ def partition_pdf(
|
||||
extract_images_in_pdf=extract_images_in_pdf,
|
||||
extract_element_types=extract_element_types,
|
||||
image_output_dir_path=image_output_dir_path,
|
||||
hi_res_model_name=hi_res_model_name,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@ -244,13 +248,14 @@ def _partition_pdf_or_image_local(
|
||||
include_page_breaks: bool = False,
|
||||
languages: Optional[List[str]] = None,
|
||||
ocr_mode: str = OCRMode.FULL_PAGE.value,
|
||||
model_name: Optional[str] = None,
|
||||
model_name: Optional[str] = None, # to be deprecated in favor of `hi_res_model_name`
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
pdf_text_extractable: bool = False,
|
||||
extract_images_in_pdf: bool = False,
|
||||
extract_element_types: Optional[List[str]] = None,
|
||||
image_output_dir_path: Optional[str] = None,
|
||||
pdf_image_dpi: Optional[int] = None,
|
||||
hi_res_model_name: Optional[str] = None,
|
||||
analysis: bool = False,
|
||||
analyzed_image_output_dir_path: Optional[str] = None,
|
||||
**kwargs,
|
||||
@ -275,10 +280,12 @@ def _partition_pdf_or_image_local(
|
||||
|
||||
ocr_languages = prepare_languages_for_tesseract(languages)
|
||||
|
||||
model_name = model_name or default_hi_res_model(infer_table_structure)
|
||||
hi_res_model_name = (
|
||||
hi_res_model_name or model_name or default_hi_res_model(infer_table_structure)
|
||||
)
|
||||
if pdf_image_dpi is None:
|
||||
pdf_image_dpi = 300 if model_name == "chipper" else 200
|
||||
if (pdf_image_dpi < 300) and (model_name == "chipper"):
|
||||
pdf_image_dpi = 300 if hi_res_model_name == "chipper" else 200
|
||||
if (pdf_image_dpi < 300) and (hi_res_model_name == "chipper"):
|
||||
logger.warning(
|
||||
"The Chipper model performs better when images are rendered with DPI >= 300 "
|
||||
f"(currently {pdf_image_dpi}).",
|
||||
@ -288,7 +295,7 @@ def _partition_pdf_or_image_local(
|
||||
inferred_document_layout = process_file_with_model(
|
||||
filename,
|
||||
is_image=is_image,
|
||||
model_name=model_name,
|
||||
model_name=hi_res_model_name,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
)
|
||||
|
||||
@ -314,7 +321,7 @@ def _partition_pdf_or_image_local(
|
||||
extracted_layout=extracted_layout,
|
||||
)
|
||||
|
||||
if model_name.startswith("chipper"):
|
||||
if hi_res_model_name.startswith("chipper"):
|
||||
# NOTE(alan): We shouldn't do OCR with chipper
|
||||
final_document_layout = merged_document_layout
|
||||
else:
|
||||
@ -331,7 +338,7 @@ def _partition_pdf_or_image_local(
|
||||
inferred_document_layout = process_data_with_model(
|
||||
file,
|
||||
is_image=is_image,
|
||||
model_name=model_name,
|
||||
model_name=hi_res_model_name,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
)
|
||||
if hasattr(file, "seek"):
|
||||
@ -347,7 +354,7 @@ def _partition_pdf_or_image_local(
|
||||
extracted_layout=extracted_layout,
|
||||
)
|
||||
|
||||
if model_name.startswith("chipper"):
|
||||
if hi_res_model_name.startswith("chipper"):
|
||||
# NOTE(alan): We shouldn't do OCR with chipper
|
||||
final_document_layout = merged_document_layout
|
||||
else:
|
||||
@ -364,7 +371,7 @@ def _partition_pdf_or_image_local(
|
||||
)
|
||||
|
||||
# NOTE(alan): starting with v2, chipper sorts the elements itself.
|
||||
if model_name == "chipper":
|
||||
if hi_res_model_name == "chipper":
|
||||
kwargs["sort_mode"] = SORT_MODE_DONT
|
||||
|
||||
final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
|
||||
@ -434,7 +441,7 @@ def _partition_pdf_or_image_local(
|
||||
).strip()
|
||||
# NOTE(alan): with chipper there are parent elements with no text we don't want to
|
||||
# filter those out and leave the children orphaned.
|
||||
if el.text or isinstance(el, PageBreak) or model_name.startswith("chipper"):
|
||||
if el.text or isinstance(el, PageBreak) or hi_res_model_name.startswith("chipper"):
|
||||
out_elements.append(cast(Element, el))
|
||||
|
||||
return out_elements
|
||||
@ -453,6 +460,7 @@ def partition_pdf_or_image(
|
||||
extract_images_in_pdf: bool = False,
|
||||
extract_element_types: Optional[List[str]] = None,
|
||||
image_output_dir_path: Optional[str] = None,
|
||||
hi_res_model_name: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Parses a pdf or image document into a list of interpreted elements."""
|
||||
@ -514,6 +522,7 @@ def partition_pdf_or_image(
|
||||
extract_images_in_pdf=extract_images_in_pdf,
|
||||
extract_element_types=extract_element_types,
|
||||
image_output_dir_path=image_output_dir_path,
|
||||
hi_res_model_name=hi_res_model_name,
|
||||
**kwargs,
|
||||
)
|
||||
out_elements = _process_uncategorized_text_elements(elements)
|
||||
|
Loading…
x
Reference in New Issue
Block a user