mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-14 00:29:47 +00:00
feat: use yolox as default to table extraction for pdf/image (#1919)
- yolox has better recall than yolox_quantized, the current default model, for table detection - update logic so that when `infer_table_structure=True` the default model is `yolox` instead of `yolox_quantized` - user can still override the default by passing in a `model_name` or setting the env variable `UNSTRUCTURED_HI_RES_MODEL_NAME` ## Test: Partition the attached file with ```python from unstructured.partition.pdf import partition_pdf yolox_elements = partition_pdf(filename, strategy="hi_res", infer_table_structure=True) yolox_quantized_elements = partition_pdf(filename, strategy="hi_res", infer_table_structure=True, model_name="yolox_quantized") ``` Compare the table elements between those two; the yolox (default) elements should have more complete tables. [AK_AK-PERS_CAFR_2008_3.pdf](https://github.com/Unstructured-IO/unstructured/files/13191198/AK_AK-PERS_CAFR_2008_3.pdf)
This commit is contained in:
parent
ff752e88df
commit
f87731e085
@ -1,9 +1,10 @@
|
|||||||
## 0.10.28-dev3
|
## 0.10.28-dev4
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
* **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance.
|
* **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance.
|
||||||
* **Add table structure evaluation helpers** Adds functions to evaluate the similarity between predicted table structure and actual table structure.
|
* **Add table structure evaluation helpers** Adds functions to evaluate the similarity between predicted table structure and actual table structure.
|
||||||
|
* **Use `yolox` by default for table extraction when partitioning pdf/image** `yolox` model provides higher recall of the table regions than the quantized version and it is now the default element detection model when `infer_table_structure=True` for partitioning pdf/image files
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
|
|||||||
@ -1045,6 +1045,21 @@ def test_chipper_not_losing_parents(chipper_results, chipper_children):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("infer_table_structure", "env", "expected"),
|
||||||
|
[
|
||||||
|
(False, None, "yolox_quantized"),
|
||||||
|
(True, None, "yolox"),
|
||||||
|
(False, "test", "test"),
|
||||||
|
(True, "test", "test"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_default_hi_res_model(infer_table_structure, env, expected, monkeypatch):
|
||||||
|
if env is not None:
|
||||||
|
monkeypatch.setenv("UNSTRUCTURED_HI_RES_MODEL_NAME", env)
|
||||||
|
assert pdf.default_hi_res_model(infer_table_structure) == expected
|
||||||
|
|
||||||
|
|
||||||
def test_partition_model_name_default_to_None():
|
def test_partition_model_name_default_to_None():
|
||||||
filename = "example-docs/DA-1p.pdf"
|
filename = "example-docs/DA-1p.pdf"
|
||||||
try:
|
try:
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.10.28-dev3" # pragma: no cover
|
__version__ = "0.10.28-dev4" # pragma: no cover
|
||||||
|
|||||||
@ -75,11 +75,14 @@ from unstructured.utils import requires_dependencies
|
|||||||
RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)
|
RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)
|
||||||
|
|
||||||
|
|
||||||
def default_hi_res_model() -> str:
|
def default_hi_res_model(infer_table_structure: bool) -> str:
|
||||||
# a light config for the hi res model; this is not defined as a constant so that no setting of
|
# a light config for the hi res model; this is not defined as a constant so that no setting of
|
||||||
# the default hi res model name is done on importing of this submodule; this allows (if user
|
# the default hi res model name is done on importing of this submodule; this allows (if user
|
||||||
# prefers) for setting env after importing the sub module and changing the default model name
|
# prefers) for setting env after importing the sub module and changing the default model name
|
||||||
return os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME", "yolox_quantized")
|
|
||||||
|
# if table structure is needed we default to yolox for better table detection
|
||||||
|
default = "yolox" if infer_table_structure else "yolox_quantized"
|
||||||
|
return os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME", default)
|
||||||
|
|
||||||
|
|
||||||
@process_metadata()
|
@process_metadata()
|
||||||
@ -366,7 +369,7 @@ def _partition_pdf_or_image_local(
|
|||||||
|
|
||||||
ocr_languages = prepare_languages_for_tesseract(languages)
|
ocr_languages = prepare_languages_for_tesseract(languages)
|
||||||
|
|
||||||
model_name = model_name or default_hi_res_model()
|
model_name = model_name or default_hi_res_model(infer_table_structure)
|
||||||
if pdf_image_dpi is None:
|
if pdf_image_dpi is None:
|
||||||
pdf_image_dpi = 300 if model_name == "chipper" else 200
|
pdf_image_dpi = 300 if model_name == "chipper" else 200
|
||||||
if (pdf_image_dpi < 300) and (model_name == "chipper"):
|
if (pdf_image_dpi < 300) and (model_name == "chipper"):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user