From f87731e085347fb50bb4960842a24ac748262e10 Mon Sep 17 00:00:00 2001 From: Yao You Date: Fri, 27 Oct 2023 15:37:45 -0500 Subject: [PATCH] feat: use yolox as default to table extraction for pdf/image (#1919) - yolox has better recall than yolox_quantized, the current default model, for table detection - update logic so that when `infer_table_structure=True` the default model is `yolox` instead of `yolox_quantized` - user can still override the default by passing in a `model_name` or setting the env variable `UNSTRUCTURED_HI_RES_MODEL_NAME` ## Test: Partition the attached file with ```python from unstructured.partition.pdf import partition_pdf yolox_elements = partition_pdf(filename, strategy="hi_res", infer_table_structure=True) yolox_quantized_elements = partition_pdf(filename, strategy="hi_res", infer_table_structure=True, model_name="yolox_quantized") ``` Compare the table elements between the two results; the yolox (default) elements should contain more complete tables. [AK_AK-PERS_CAFR_2008_3.pdf](https://github.com/Unstructured-IO/unstructured/files/13191198/AK_AK-PERS_CAFR_2008_3.pdf) --- CHANGELOG.md | 3 ++- test_unstructured/partition/pdf_image/test_pdf.py | 15 +++++++++++++++ unstructured/__version__.py | 2 +- unstructured/partition/pdf.py | 9 ++++++--- 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fe0253ac..df7d3cda6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,10 @@ -## 0.10.28-dev3 +## 0.10.28-dev4 ### Enhancements * **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance. * **Add table structure evaluation helpers** Adds functions to evaluate the similarity between predicted table structure and actual table structure. 
+* **Use `yolox` by default for table extraction when partitioning pdf/image** `yolox` model provides higher recall of the table regions than the quantized version and it is now the default element detection model when `infer_table_structure=True` for partitioning pdf/image files ### Features diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index cf082ab5c..ad2903ec4 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -1045,6 +1045,21 @@ def test_chipper_not_losing_parents(chipper_results, chipper_children): ) +@pytest.mark.parametrize( + ("infer_table_structure", "env", "expected"), + [ + (False, None, "yolox_quantized"), + (True, None, "yolox"), + (False, "test", "test"), + (True, "test", "test"), + ], +) +def test_default_hi_res_model(infer_table_structure, env, expected, monkeypatch): + if env is not None: + monkeypatch.setenv("UNSTRUCTURED_HI_RES_MODEL_NAME", env) + assert pdf.default_hi_res_model(infer_table_structure) == expected + + def test_partition_model_name_default_to_None(): filename = "example-docs/DA-1p.pdf" try: diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 874da9ed2..606e14436 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.28-dev3" # pragma: no cover +__version__ = "0.10.28-dev4" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 8b023b564..89b1ee777 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -75,11 +75,14 @@ from unstructured.utils import requires_dependencies RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL) -def default_hi_res_model() -> str: +def default_hi_res_model(infer_table_structure: bool) -> str: # a light config for the hi res model; this is not defined as a constant so that no setting of # the 
default hi res model name is done on importing of this submodule; this allows (if user # prefers) for setting env after importing the sub module and changing the default model name - return os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME", "yolox_quantized") + + # if table structure is needed we default to yolox for better table detection + default = "yolox" if infer_table_structure else "yolox_quantized" + return os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME", default) @process_metadata() @@ -366,7 +369,7 @@ def _partition_pdf_or_image_local( ocr_languages = prepare_languages_for_tesseract(languages) - model_name = model_name or default_hi_res_model() + model_name = model_name or default_hi_res_model(infer_table_structure) if pdf_image_dpi is None: pdf_image_dpi = 300 if model_name == "chipper" else 200 if (pdf_image_dpi < 300) and (model_name == "chipper"):