feat: use yolox as default for table extraction for pdf/image (#1919)

- yolox has better recall than yolox_quantized, the current default model, for table detection - update logic so that when `infer_table_structure=True` the default model is `yolox` instead of `yolox_quantized` - user can still override the default by passing in a `model_name` or setting the env variable `UNSTRUCTURED_HI_RES_MODEL_NAME` ## Test: Partition the attached file with ```python from unstructured.partition.pdf import partition_pdf yolox_elements = partition_pdf(filename, strategy="hi_res", infer_table_structure=True) yolox_quantized_elements = partition_pdf(filename, strategy="hi_res", infer_table_structure=True, model_name="yolox_quantized") ``` Compare the table elements between those two; the yolox (default) elements should contain more complete tables. [AK_AK-PERS_CAFR_2008_3.pdf](https://github.com/Unstructured-IO/unstructured/files/13191198/AK_AK-PERS_CAFR_2008_3.pdf)
2025-12-12 15:42:19 +00:00 · 2023-10-27 15:37:45 -05:00 · 2023-10-27 15:37:45 -05:00 · f87731e085
commit f87731e085
parent ff752e88df
4 changed files with 24 additions and 5 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,9 +1,10 @@
-## 0.10.28-dev3
+## 0.10.28-dev4

 ### Enhancements

 * **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance.
 * **Add table structure evaluation helpers** Adds functions to evaluate the similarity between predicted table structure and actual table structure.
+* **Use `yolox` by default for table extraction when partitioning pdf/image** The `yolox` model provides higher recall of table regions than the quantized version, so it is now the default element detection model when `infer_table_structure=True` for partitioning pdf/image files.

 ### Features

--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@ -1045,6 +1045,21 @@ def test_chipper_not_losing_parents(chipper_results, chipper_children):
    )


+@pytest.mark.parametrize(
+    ("infer_table_structure", "env", "expected"),
+    [
+        (False, None, "yolox_quantized"),
+        (True, None, "yolox"),
+        (False, "test", "test"),
+        (True, "test", "test"),
+    ],
+)
+def test_default_hi_res_model(infer_table_structure, env, expected, monkeypatch):
+    if env is not None:
+        monkeypatch.setenv("UNSTRUCTURED_HI_RES_MODEL_NAME", env)
+    assert pdf.default_hi_res_model(infer_table_structure) == expected
+
+
 def test_partition_model_name_default_to_None():
    filename = "example-docs/DA-1p.pdf"
    try:
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.10.28-dev3"  # pragma: no cover
+__version__ = "0.10.28-dev4"  # pragma: no cover
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -75,11 +75,14 @@ from unstructured.utils import requires_dependencies
 RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)


-def default_hi_res_model() -> str:
+def default_hi_res_model(infer_table_structure: bool) -> str:
    # a light config for the hi res model; this is not defined as a constant so that no setting of
    # the default hi res model name is done on importing of this submodule; this allows (if user
    # prefers) for setting env after importing the sub module and changing the default model name
-    return os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME", "yolox_quantized")
+
+    # if table structure is needed we default to yolox for better table detection
+    default = "yolox" if infer_table_structure else "yolox_quantized"
+    return os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME", default)


@process_metadata()
@ -366,7 +369,7 @@ def _partition_pdf_or_image_local(

    ocr_languages = prepare_languages_for_tesseract(languages)

-    model_name = model_name or default_hi_res_model()
+    model_name = model_name or default_hi_res_model(infer_table_structure)
    if pdf_image_dpi is None:
        pdf_image_dpi = 300 if model_name == "chipper" else 200
    if (pdf_image_dpi < 300) and (model_name == "chipper"):