mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-12 15:42:19 +00:00
feat: use yolox as default for table extraction for pdf/image (#1919)
- yolox has better recall than yolox_quantized, the current default model, for table detection - update logic so that when `infer_table_structure=True` the default model is `yolox` instead of `yolox_quantized` - user can still override the default by passing in a `model_name` or setting the env variable `UNSTRUCTURED_HI_RES_MODEL_NAME` ## Test: Partition the attached file with ```python from unstructured.partition.pdf import partition_pdf yolox_elements = partition_pdf(filename, strategy="hi_res", infer_table_structure=True) yolox_quantized_elements = partition_pdf(filename, strategy="hi_res", infer_table_structure=True, model_name="yolox_quantized") ``` Compare the table elements between those two and yolox (default) elements should have a more complete table. [AK_AK-PERS_CAFR_2008_3.pdf](https://github.com/Unstructured-IO/unstructured/files/13191198/AK_AK-PERS_CAFR_2008_3.pdf)
This commit is contained in:
parent
ff752e88df
commit
f87731e085
@ -1,9 +1,10 @@
|
||||
## 0.10.28-dev3
|
||||
## 0.10.28-dev4
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance.
|
||||
* **Add table structure evaluation helpers** Adds functions to evaluate the similarity between predicted table structure and actual table structure.
|
||||
* **Use `yolox` by default for table extraction when partitioning pdf/image** `yolox` model provides higher recall of the table regions than the quantized version and it is now the default element detection model when `infer_table_structure=True` for partitioning pdf/image files.
|
||||
|
||||
### Features
|
||||
|
||||
|
||||
@ -1045,6 +1045,21 @@ def test_chipper_not_losing_parents(chipper_results, chipper_children):
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("infer_table_structure", "env", "expected"),
|
||||
[
|
||||
(False, None, "yolox_quantized"),
|
||||
(True, None, "yolox"),
|
||||
(False, "test", "test"),
|
||||
(True, "test", "test"),
|
||||
],
|
||||
)
|
||||
def test_default_hi_res_model(infer_table_structure, env, expected, monkeypatch):
|
||||
if env is not None:
|
||||
monkeypatch.setenv("UNSTRUCTURED_HI_RES_MODEL_NAME", env)
|
||||
assert pdf.default_hi_res_model(infer_table_structure) == expected
|
||||
|
||||
|
||||
def test_partition_model_name_default_to_None():
|
||||
filename = "example-docs/DA-1p.pdf"
|
||||
try:
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.10.28-dev3" # pragma: no cover
|
||||
__version__ = "0.10.28-dev4" # pragma: no cover
|
||||
|
||||
@ -75,11 +75,14 @@ from unstructured.utils import requires_dependencies
|
||||
RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)
|
||||
|
||||
|
||||
def default_hi_res_model() -> str:
|
||||
def default_hi_res_model(infer_table_structure: bool) -> str:
|
||||
# a light config for the hi res model; this is not defined as a constant so that no setting of
|
||||
# the default hi res model name is done on importing of this submodule; this allows (if user
|
||||
# prefers) for setting env after importing the sub module and changing the default model name
|
||||
return os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME", "yolox_quantized")
|
||||
|
||||
# if table structure is needed we default to yolox for better table detection
|
||||
default = "yolox" if infer_table_structure else "yolox_quantized"
|
||||
return os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME", default)
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@ -366,7 +369,7 @@ def _partition_pdf_or_image_local(
|
||||
|
||||
ocr_languages = prepare_languages_for_tesseract(languages)
|
||||
|
||||
model_name = model_name or default_hi_res_model()
|
||||
model_name = model_name or default_hi_res_model(infer_table_structure)
|
||||
if pdf_image_dpi is None:
|
||||
pdf_image_dpi = 300 if model_name == "chipper" else 200
|
||||
if (pdf_image_dpi < 300) and (model_name == "chipper"):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user