Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-09-27 09:25:59 +00:00
remove unused _with_spans metric (#3342)
The table metrics that consider spans are not used and they clutter the output, so I have removed that code. I have left table_as_cells in the source code, though, since it may still be useful to users.
Parent: caea73c8e3
Commit: 609a08a95f
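For readers who want the cell-level comparison the message mentions, it can still be run directly through TableEvalProcessor, which this commit leaves in place. The sketch below is illustrative only: the import path, file paths, and cutoff value are assumptions, not taken from this commit; the keyword arguments mirror the code removed further down.

# Hedged sketch: running the kept "table_as_cells" evaluation directly.
# Import path, file paths, and cutoff are assumptions for illustration.
from pathlib import Path

from unstructured.metrics.table.table_eval import TableEvalProcessor  # path assumed

processor = TableEvalProcessor.from_json_files(
    prediction_file=Path("predictions/IRS-2023-Form-1095-A.pdf.json"),    # hypothetical path
    ground_truth_file=Path("ground_truth/IRS-2023-Form-1095-A.pdf.json"),  # hypothetical path
    cutoff=0.8,            # illustrative value; the calculator passes self.cutoff
    source_type="cells",   # the cell-level mode this commit keeps in the source
)
report = processor.process_file()
print(report)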
Changelog:

@@ -1,4 +1,4 @@
-## 0.14.10-dev11
+## 0.14.10-dev12

 ### Enhancements

@@ -6,6 +6,7 @@
 greater than min version and updated tests that were failing given the update.
 * **`.doc` files are now supported in the `arm64` image.**. `libreoffice24` is added to the `arm64` image, meaning `.doc` files are now supported. We have follow on work planned to investigate adding `.ppt` support for `arm64` as well.
 * Add table detection metrics: recall, precision and f1
+* Remove unused _with_spans metrics

 ### Features

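The changelog entry about table detection metrics refers to recall, precision and F1 over detected tables. As a generic illustration of those definitions (not the library's implementation), counts of matched, predicted and ground-truth tables combine as follows:

# Generic precision/recall/F1 over matched tables; standard definitions only,
# not the code added by the referenced changelog entry.
def detection_scores(num_matched: int, num_predicted: int, num_ground_truth: int):
    precision = num_matched / num_predicted if num_predicted else 0.0
    recall = num_matched / num_ground_truth if num_ground_truth else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1

# e.g. 4 predicted tables, 5 in the ground truth, 3 matched:
print(detection_scores(3, 4, 5))  # (0.75, 0.6, 0.666...)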
Metrics evaluation tests:

@@ -115,7 +115,7 @@ def test_text_extraction_evaluation():
             UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME,
             GOLD_TABLE_STRUCTURE_DIRNAME,
             Path("IRS-2023-Form-1095-A.pdf.json"),
-            23,
+            13,
             {},
         ),
         (
@@ -191,7 +191,7 @@ def test_table_structure_evaluation():
     assert os.path.isfile(os.path.join(export_dir, "aggregate-table-structure-accuracy.tsv"))
     df = pd.read_csv(os.path.join(export_dir, "all-docs-table-structure-accuracy.tsv"), sep="\t")
     assert len(df) == 1
-    assert len(df.columns) == 23
+    assert len(df.columns) == 13
     assert df.iloc[0].filename == "IRS-2023-Form-1095-A.pdf"

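The expected column count drops from 23 to 13 because each table metric previously produced a second, _with_spans-suffixed column. Assuming ten supported table metrics (an assumption; the real list is supported_metric_names) plus the filename, doctype and connector columns, the arithmetic works out as:

# The metric count is an assumption for illustration; the real list is
# TableStructureMetricsCalculator.supported_metric_names.
id_columns = ["filename", "doctype", "connector"]  # 3 identifier columns
num_metrics = 10                                   # assumed number of table metrics

columns_before = len(id_columns) + 2 * num_metrics  # plain + _with_spans -> 23
columns_after = len(id_columns) + num_metrics       # plain metrics only  -> 13
print(columns_before, columns_after)                # 23 13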
Version module:

@@ -1 +1 @@
-__version__ = "0.14.10-dev11" # pragma: no cover
+__version__ = "0.14.10-dev12" # pragma: no cover

TableStructureMetricsCalculator:

@@ -229,35 +229,18 @@ class TableStructureMetricsCalculator(BaseMetricsCalculator):
             source_type="html",
         )
         report_from_html = processor_from_text_as_html.process_file()
-
-        processor_from_table_as_cells = TableEvalProcessor.from_json_files(
-            prediction_file=prediction_file,
-            ground_truth_file=ground_truth_file,
-            cutoff=self.cutoff,
-            source_type="cells",
-        )
-        report_from_cells = processor_from_table_as_cells.process_file()
-        return (
-            [
+        return [
             out_filename,
             doctype,
             connector,
-            ]
-            + [getattr(report_from_html, metric) for metric in self.supported_metric_names]
-            + [getattr(report_from_cells, metric) for metric in self.supported_metric_names]
-        )
+        ] + [getattr(report_from_html, metric) for metric in self.supported_metric_names]

     def _generate_dataframes(self, rows):
-        # NOTE(mike): this logic should be simplified
-        suffixed_table_eval_metrics = [
-            f"{metric}_with_spans" for metric in self.supported_metric_names
-        ]
-        combined_table_metrics = self.supported_metric_names + suffixed_table_eval_metrics
         headers = [
             "filename",
             "doctype",
             "connector",
-        ] + combined_table_metrics
+        ] + self.supported_metric_names

         df = pd.DataFrame(rows, columns=headers)
         has_tables_df = df[df["total_tables"] > 0]
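With the span metrics gone, each processed document contributes a single row: the file identifiers followed by one value per supported metric read from the HTML-based report. A minimal sketch of how such a row lines up with the headers built in _generate_dataframes; the metric names and values below are placeholders, not the library's actual supported_metric_names:

import pandas as pd

supported_metric_names = ["total_tables", "table_level_acc"]  # placeholder subset
headers = ["filename", "doctype", "connector"] + supported_metric_names

# One value per metric, in the same order as the headers.
row = ["IRS-2023-Form-1095-A.pdf", "pdf", None] + [5, 0.92]
df = pd.DataFrame([row], columns=headers)
print(df)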
@@ -268,7 +251,7 @@ class TableStructureMetricsCalculator(BaseMetricsCalculator):
             ).reset_index()
         else:
             element_metrics_results = {}
-        for metric in combined_table_metrics:
+        for metric in self.supported_metric_names:
             metric_df = has_tables_df[has_tables_df[metric].notnull()]
             agg_metric = metric_df[metric].agg([_mean, _stdev, _pstdev, _count]).transpose()
             if agg_metric.empty:
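The loop in this hunk aggregates each metric across the documents that actually contain tables, computing a mean, sample and population standard deviation, and a count. Below is a self-contained sketch of that pattern; the _mean, _stdev, _pstdev and _count helpers are stand-ins written for this example (the metrics module defines its own), and the data is invented:

import statistics

import pandas as pd

# Stand-in aggregation helpers; the metrics module ships its own equivalents.
def _mean(values):
    return statistics.mean(values) if len(values) else None

def _stdev(values):
    return statistics.stdev(values) if len(values) > 1 else None

def _pstdev(values):
    return statistics.pstdev(values) if len(values) else None

def _count(values):
    return len(values)

supported_metric_names = ["table_level_acc"]  # placeholder subset of metric names
df = pd.DataFrame(
    {
        "filename": ["a.pdf", "b.pdf", "c.pdf"],
        "total_tables": [2, 0, 1],
        "table_level_acc": [0.9, None, 0.7],
    }
)

has_tables_df = df[df["total_tables"] > 0]  # keep only documents that contain tables
for metric in supported_metric_names:
    metric_df = has_tables_df[has_tables_df[metric].notnull()]
    agg_metric = metric_df[metric].agg([_mean, _stdev, _pstdev, _count]).transpose()
    print(metric, agg_metric.to_dict())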