feat: use yolox as default for table extraction for pdf/image (#1919)

- yolox has better recall than yolox_quantized, the current default
model, for table detection
- update logic so that when `infer_table_structure=True` the default
model is `yolox` instead of `yolox_quantized`
- user can still override the default by passing in a `model_name` or
setting the env variable `UNSTRUCTURED_HI_RES_MODEL_NAME`

## Test:

Partition the attached file with 

```python
from unstructured.partition.pdf import partition_pdf

yolox_elements = partition_pdf(filename, strategy="hi_res", infer_table_structure=True)
yolox_quantized_elements = partition_pdf(filename, strategy="hi_res", infer_table_structure=True, model_name="yolox_quantized")
```

Compare the table elements between those two; the yolox (default)
elements should have more complete tables.


[AK_AK-PERS_CAFR_2008_3.pdf](https://github.com/Unstructured-IO/unstructured/files/13191198/AK_AK-PERS_CAFR_2008_3.pdf)
This commit is contained in:
Yao You 2023-10-27 15:37:45 -05:00 committed by GitHub
parent ff752e88df
commit f87731e085
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 24 additions and 5 deletions

View File

@ -1,9 +1,10 @@
## 0.10.28-dev3
## 0.10.28-dev4
### Enhancements
* **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance.
* **Add table structure evaluation helpers** Adds functions to evaluate the similarity between predicted table structure and actual table structure.
* **Use `yolox` by default for table extraction when partitioning pdf/image** `yolox` model provides higher recall of the table regions than the quantized version and it is now the default element detection model when `infer_table_structure=True` for partitioning pdf/image files
### Features

View File

@ -1045,6 +1045,21 @@ def test_chipper_not_losing_parents(chipper_results, chipper_children):
)
@pytest.mark.parametrize(
    ("infer_table_structure", "env", "expected"),
    [
        (False, None, "yolox_quantized"),
        (True, None, "yolox"),
        (False, "test", "test"),
        (True, "test", "test"),
    ],
)
def test_default_hi_res_model(infer_table_structure, env, expected, monkeypatch):
    """Check the default hi-res model name for each table-inference/env combination.

    With no env override the default depends on ``infer_table_structure``;
    when ``UNSTRUCTURED_HI_RES_MODEL_NAME`` is set, its value is returned
    regardless of ``infer_table_structure``.
    """
    if env is None:
        # Clear any value inherited from the developer's shell or CI so the
        # "no override" cases really exercise the built-in defaults.
        monkeypatch.delenv("UNSTRUCTURED_HI_RES_MODEL_NAME", raising=False)
    else:
        monkeypatch.setenv("UNSTRUCTURED_HI_RES_MODEL_NAME", env)
    assert pdf.default_hi_res_model(infer_table_structure) == expected
def test_partition_model_name_default_to_None():
filename = "example-docs/DA-1p.pdf"
try:

View File

@ -1 +1 @@
__version__ = "0.10.28-dev3" # pragma: no cover
__version__ = "0.10.28-dev4" # pragma: no cover

View File

@ -75,11 +75,14 @@ from unstructured.utils import requires_dependencies
RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)
def default_hi_res_model() -> str:
def default_hi_res_model(infer_table_structure: bool) -> str:
# a light config for the hi res model; this is not defined as a constant so that no setting of
# the default hi res model name is done on importing of this submodule; this allows (if user
# prefers) for setting env after importing the sub module and changing the default model name
return os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME", "yolox_quantized")
# if table structure is needed we default to yolox for better table detection
default = "yolox" if infer_table_structure else "yolox_quantized"
return os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME", default)
@process_metadata()
@ -366,7 +369,7 @@ def _partition_pdf_or_image_local(
ocr_languages = prepare_languages_for_tesseract(languages)
model_name = model_name or default_hi_res_model()
model_name = model_name or default_hi_res_model(infer_table_structure)
if pdf_image_dpi is None:
pdf_image_dpi = 300 if model_name == "chipper" else 200
if (pdf_image_dpi < 300) and (model_name == "chipper"):