From c24e6e056c53e5bda1688c6f48339b52552988dc Mon Sep 17 00:00:00 2001 From: shreyanid <42684285+shreyanid@users.noreply.github.com> Date: Thu, 2 Nov 2023 12:15:53 -0700 Subject: [PATCH] chore: add doctype to ingest evaluation functions (#1977) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Summary To combine ingest and holistic metrics efforts, add the `doctype` field to the results from the functions in evaluate.py for use in subsequent aggregation functions. ### Test Run `sh ./test_unstructured_ingest/evaluation-metrics.sh text-extraction` and there will be a new doctype column with the file's doctype extension. Screenshot 2023-11-01 at 2 23 11 PM --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: shreyanid --- CHANGELOG.md | 3 +- .../jira-diff/1/10000.json | 105 ------------------ .../UDHR_first_article_all.txt.json | 16 --- .../metrics/aggregate-scores-cct.tsv | 2 +- .../metrics/all-docs-cct.tsv | 9 +- .../all-docs-element-type-frequency.tsv | 2 +- unstructured/__version__.py | 2 +- unstructured/ingest/evaluate.py | 12 +- 8 files changed, 18 insertions(+), 133 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f4977a2e..8d9c44c22 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ -## 0.10.29-dev8 +## 0.10.29-dev9 ### Enhancements +* **Add doctype field to CI metric functions** Adds a doctype column to the ingest metric sheets for use in subsequent aggregations. * **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance. * **Separate chipper tests** Chipper tests are long-running and require special access, so the tests have been separated into their own file under their own marker, and now have a separate `make` target. * **Add include_header argument for partition_csv and partition_tsv** Now supports retaining header rows in CSV and TSV documents element partitioning. diff --git a/test_unstructured_ingest/expected-structured-output/jira-diff/1/10000.json b/test_unstructured_ingest/expected-structured-output/jira-diff/1/10000.json index 9218aea30..d03951469 100644 --- a/test_unstructured_ingest/expected-structured-output/jira-diff/1/10000.json +++ b/test_unstructured_ingest/expected-structured-output/jira-diff/1/10000.json @@ -230,27 +230,6 @@ }, "text": "!image" }, - { - "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": { - "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1", - "record_locator": { - "base_url": "https://unstructured-jira-connector-test.atlassian.net", - "issue_key": "JCTP1-1" - }, - "date_created": "2023-08-22T11:29:37.774000+00:00", - "date_modified": "2023-08-24T12:05:04.690000+00:00" - }, - "filetype": "text/plain", - "languages": [ - "cat", - "eng" - ] - }, - "text": "" - }, { "type": "UncategorizedText", "element_id": "d1e0c46eef256237b23aa43bfbadb23c", @@ -272,27 +251,6 @@ }, "text": "20230823" }, - { - "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": { - "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1", - "record_locator": { - "base_url": "https://unstructured-jira-connector-test.atlassian.net", - "issue_key": "JCTP1-1" - }, - "date_created": "2023-08-22T11:29:37.774000+00:00", - "date_modified": "2023-08-24T12:05:04.690000+00:00" - }, - "filetype": "text/plain", - "languages": [ - "cat", - "eng" - ] - }, - "text": "" - }, { "type": "UncategorizedText", "element_id": "69d89b316aa1ff82d60e8438f764b0cf", @@ -419,27 +377,6 @@ }, "text": "https://unstructured" }, - { - "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": { - "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1", - "record_locator": { - "base_url": "https://unstructured-jira-connector-test.atlassian.net", - "issue_key": "JCTP1-1" - }, - "date_created": "2023-08-22T11:29:37.774000+00:00", - "date_modified": "2023-08-24T12:05:04.690000+00:00" - }, - "filetype": "text/plain", - "languages": [ - "cat", - "eng" - ] - }, - "text": "" - }, { "type": "Title", "element_id": "1e32847769dc4a1588004a1dfdf10041", @@ -461,27 +398,6 @@ }, "text": "jira" }, - { - "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": { - "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1", - "record_locator": { - "base_url": "https://unstructured-jira-connector-test.atlassian.net", - "issue_key": "JCTP1-1" - }, - "date_created": "2023-08-22T11:29:37.774000+00:00", - "date_modified": "2023-08-24T12:05:04.690000+00:00" - }, - "filetype": "text/plain", - "languages": [ - "cat", - "eng" - ] - }, - "text": "" - }, { "type": "Title", "element_id": "c08c6acfff81cafe379f88061e6b71bf", @@ -503,27 +419,6 @@ }, "text": "connector" }, - { - "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": { - "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1", - "record_locator": { - "base_url": "https://unstructured-jira-connector-test.atlassian.net", - "issue_key": "JCTP1-1" - }, - "date_created": "2023-08-22T11:29:37.774000+00:00", - "date_modified": "2023-08-24T12:05:04.690000+00:00" - }, - "filetype": "text/plain", - "languages": [ - "cat", - "eng" - ] - }, - "text": "" - }, { "type": "Title", "element_id": "d680679411e53e07cf66ab4bad00bfcd", diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/language-docs/UDHR_first_article_all.txt.json b/test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/language-docs/UDHR_first_article_all.txt.json index a69276ed1..4ba614f91 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/language-docs/UDHR_first_article_all.txt.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/language-docs/UDHR_first_article_all.txt.json @@ -764,22 +764,6 @@ }, "text": "Awa" }, - { - "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": { - "url": "example-docs/language-docs/UDHR_first_article_all.txt", - "permissions_data": [ - { - "mode": 33188 - } - ] - }, - "filetype": "text/plain" - }, - "text": "" - }, { "type": "Title", "element_id": "9989b925ebf5c724d61091d02fab6585", diff --git a/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv b/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv index 0054e9a7e..e362972a4 100644 --- a/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv +++ b/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv @@ -1,3 +1,3 @@ strategy average sample_sd population_sd count cct-accuracy 0.798 0.083 0.072 4 -cct-%missing 0.087 0.037 0.032 4 +cct-%missing 0.089 0.04 0.035 4 diff --git a/test_unstructured_ingest/metrics/all-docs-cct.tsv b/test_unstructured_ingest/metrics/all-docs-cct.tsv index 7b714a590..69ffeaaab 100644 --- a/test_unstructured_ingest/metrics/all-docs-cct.tsv +++ b/test_unstructured_ingest/metrics/all-docs-cct.tsv @@ -1,4 +1,5 @@ -filename connector cct-accuracy cct-%missing -handbook-1p.docx box 0.974 0.03 -example-10k.html local 0.686 0.04 -IRS-form-1987.pdf azure 0.783 0.13 +filename doctype connector cct-accuracy cct-%missing +science-exploration-1p.pptx pptx dropbox 0.861 0.093 +science-exploration-1p.pptx pptx box 0.861 0.093 +example-10k.html html local 0.686 0.037 +IRS-form-1987.pdf pdf azure 0.783 0.135 diff --git a/test_unstructured_ingest/metrics/all-docs-element-type-frequency.tsv b/test_unstructured_ingest/metrics/all-docs-element-type-frequency.tsv index 50d494248..b5fc1d7d0 100644 --- a/test_unstructured_ingest/metrics/all-docs-element-type-frequency.tsv +++ b/test_unstructured_ingest/metrics/all-docs-element-type-frequency.tsv @@ -1 +1 @@ -filename connector element-type-accuracy +filename doctype connector element-type-accuracy diff --git a/unstructured/__version__.py b/unstructured/__version__.py index c95c99598..6e0192bbe 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.29-dev8" # pragma: no cover +__version__ = "0.10.29-dev9" # pragma: no cover diff --git a/unstructured/ingest/evaluate.py b/unstructured/ingest/evaluate.py index ff78b1a9a..ef8f72d17 100755 --- a/unstructured/ingest/evaluate.py +++ b/unstructured/ingest/evaluate.py @@ -98,21 +98,24 @@ def measure_text_edit_distance( accuracy_scores: List[float] = [] percent_missing_scores: List[float] = [] + # assumption: output file name convention is name-of-file.doc.json for doc in output_list: # type: ignore fn = (doc.split("/")[-1]).split(".json")[0] + doctype = fn.rsplit(".", 1)[-1] fn_txt = fn + ".txt" connector = doc.split("/")[0] + if fn_txt in source_list: # type: ignore output_cct = elements_to_text(elements_from_json(os.path.join(output_dir, doc))) source_cct = _read_text(os.path.join(source_dir, fn_txt)) accuracy = round(calculate_accuracy(output_cct, source_cct, weights), 3) percent_missing = round(calculate_percent_missing_text(output_cct, source_cct), 3) - rows.append([fn, connector, accuracy, percent_missing]) + rows.append([fn, doctype, connector, accuracy, percent_missing]) accuracy_scores.append(accuracy) percent_missing_scores.append(percent_missing) - headers = ["filename", "connector", "cct-accuracy", "cct-%missing"] + headers = ["filename", "doctype", "connector", "cct-accuracy", "cct-%missing"] _write_to_file(export_dir, "all-docs-cct.tsv", rows, headers) agg_rows = [] @@ -187,15 +190,16 @@ def measure_element_type_accuracy( for doc in output_list: # type: ignore fn = (doc.split("/")[-1]).split(".json")[0] + doctype = fn.rsplit(".", 1)[-1] connector = doc.split("/")[0] if doc in source_list: # type: ignore output = get_element_type_frequency(_read_text(os.path.join(output_dir, doc))) source = get_element_type_frequency(_read_text(os.path.join(source_dir, doc))) accuracy = round(calculate_element_type_percent_match(output, source), 3) - rows.append([fn, connector, accuracy]) + rows.append([fn, doctype, connector, accuracy]) accuracy_scores.append(accuracy) - headers = ["filename", "connector", "element-type-accuracy"] + headers = ["filename", "doctype", "connector", "element-type-accuracy"] _write_to_file(export_dir, "all-docs-element-type-frequency.tsv", rows, headers) agg_rows = []