From 609a08a95f930e45761fd50e152d8ef01502e96d Mon Sep 17 00:00:00 2001 From: Pluto Date: Mon, 8 Jul 2024 18:59:53 +0200 Subject: [PATCH] remove unused _with_spans metric (#3342) The table metrics considering spans are not used and they clutter the output, so I have removed them from the code. However, I have left table_as_cells in the source code - it may still be useful for users. --- CHANGELOG.md | 3 ++- test_unstructured/metrics/test_evaluate.py | 4 +-- unstructured/__version__.py | 2 +- unstructured/metrics/evaluate.py | 31 +++++----------------- 4 files changed, 12 insertions(+), 28 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6474ac9a0..ed7ae6d76 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.10-dev11 +## 0.14.10-dev12 ### Enhancements @@ -6,6 +6,7 @@ greater than min version and updated tests that were failing given the update. * **`.doc` files are now supported in the `arm64` image.**. `libreoffice24` is added to the `arm64` image, meaning `.doc` files are now supported. We have follow on work planned to investigate adding `.ppt` support for `arm64` as well. 
* Add table detection metrics: recall, precision and f1 +* Remove unused _with_spans metrics ### Features diff --git a/test_unstructured/metrics/test_evaluate.py b/test_unstructured/metrics/test_evaluate.py index 79fa01be9..dae5dfa32 100644 --- a/test_unstructured/metrics/test_evaluate.py +++ b/test_unstructured/metrics/test_evaluate.py @@ -115,7 +115,7 @@ def test_text_extraction_evaluation(): UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME, GOLD_TABLE_STRUCTURE_DIRNAME, Path("IRS-2023-Form-1095-A.pdf.json"), - 23, + 13, {}, ), ( @@ -191,7 +191,7 @@ def test_table_structure_evaluation(): assert os.path.isfile(os.path.join(export_dir, "aggregate-table-structure-accuracy.tsv")) df = pd.read_csv(os.path.join(export_dir, "all-docs-table-structure-accuracy.tsv"), sep="\t") assert len(df) == 1 - assert len(df.columns) == 23 + assert len(df.columns) == 13 assert df.iloc[0].filename == "IRS-2023-Form-1095-A.pdf" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 1516c1e9a..b0e2b8e81 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.10-dev11" # pragma: no cover +__version__ = "0.14.10-dev12" # pragma: no cover diff --git a/unstructured/metrics/evaluate.py b/unstructured/metrics/evaluate.py index 01b4fb56a..566fa23b1 100755 --- a/unstructured/metrics/evaluate.py +++ b/unstructured/metrics/evaluate.py @@ -229,35 +229,18 @@ class TableStructureMetricsCalculator(BaseMetricsCalculator): source_type="html", ) report_from_html = processor_from_text_as_html.process_file() - - processor_from_table_as_cells = TableEvalProcessor.from_json_files( - prediction_file=prediction_file, - ground_truth_file=ground_truth_file, - cutoff=self.cutoff, - source_type="cells", - ) - report_from_cells = processor_from_table_as_cells.process_file() - return ( - [ - out_filename, - doctype, - connector, - ] - + [getattr(report_from_html, metric) for metric in self.supported_metric_names] - + [getattr(report_from_cells, metric) 
for metric in self.supported_metric_names] - ) + return [ + out_filename, + doctype, + connector, + ] + [getattr(report_from_html, metric) for metric in self.supported_metric_names] def _generate_dataframes(self, rows): - # NOTE(mike): this logic should be simplified - suffixed_table_eval_metrics = [ - f"{metric}_with_spans" for metric in self.supported_metric_names - ] - combined_table_metrics = self.supported_metric_names + suffixed_table_eval_metrics headers = [ "filename", "doctype", "connector", - ] + combined_table_metrics + ] + self.supported_metric_names df = pd.DataFrame(rows, columns=headers) has_tables_df = df[df["total_tables"] > 0] @@ -268,7 +251,7 @@ class TableStructureMetricsCalculator(BaseMetricsCalculator): ).reset_index() else: element_metrics_results = {} - for metric in combined_table_metrics: + for metric in self.supported_metric_names: metric_df = has_tables_df[has_tables_df[metric].notnull()] agg_metric = metric_df[metric].agg([_mean, _stdev, _pstdev, _count]).transpose() if agg_metric.empty: