chore: add doctype to ingest evaluation functions (#1977)

### Summary

To combine the ingest and holistic metrics efforts, add a `doctype` field to the results produced by the functions in `evaluate.py`, for use in subsequent aggregation functions.

### Test

Run `sh ./test_unstructured_ingest/evaluation-metrics.sh text-extraction`; the output will contain a new `doctype` column holding each file's document type (its file extension).

<img width="508" alt="Screenshot 2023-11-01 at 2 23 11 PM" src="https://github.com/Unstructured-IO/unstructured/assets/42684285/44583da9-e7ef-4142-be72-c2247b954bcf">

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: shreyanid <shreyanid@users.noreply.github.com>
2025-12-05 19:42:27 +00:00 · 2023-11-02 12:15:53 -07:00 · 2023-11-02 12:15:53 -07:00 · c24e6e056c
commit c24e6e056c
parent d07baed4a1
8 changed files with 18 additions and 133 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,7 +1,8 @@
-## 0.10.29-dev8
+## 0.10.29-dev9

 ### Enhancements

+* **Add doctype field to CI metric functions** Adds a doctype column to the ingest metric sheets for use in subsequent aggregations.
 * **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance.
 * **Separate chipper tests** Chipper tests are long-running and require special access, so the tests have been separated into their own file under their own marker, and now have a separate `make` target.
 * **Add include_header argument for partition_csv and partition_tsv** Now supports retaining header rows in CSV and TSV documents element partitioning.
--- a/test_unstructured_ingest/expected-structured-output/jira-diff/1/10000.json
+++ b/test_unstructured_ingest/expected-structured-output/jira-diff/1/10000.json
@ -230,27 +230,6 @@
    },
    "text": "!image"
  },
-  {
-    "type": "ListItem",
-    "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
-    "metadata": {
-      "data_source": {
-        "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
-        "record_locator": {
-          "base_url": "https://unstructured-jira-connector-test.atlassian.net",
-          "issue_key": "JCTP1-1"
-        },
-        "date_created": "2023-08-22T11:29:37.774000+00:00",
-        "date_modified": "2023-08-24T12:05:04.690000+00:00"
-      },
-      "filetype": "text/plain",
-      "languages": [
-        "cat",
-        "eng"
-      ]
-    },
-    "text": ""
-  },
  {
    "type": "UncategorizedText",
    "element_id": "d1e0c46eef256237b23aa43bfbadb23c",
@ -272,27 +251,6 @@
    },
    "text": "20230823"
  },
-  {
-    "type": "ListItem",
-    "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
-    "metadata": {
-      "data_source": {
-        "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
-        "record_locator": {
-          "base_url": "https://unstructured-jira-connector-test.atlassian.net",
-          "issue_key": "JCTP1-1"
-        },
-        "date_created": "2023-08-22T11:29:37.774000+00:00",
-        "date_modified": "2023-08-24T12:05:04.690000+00:00"
-      },
-      "filetype": "text/plain",
-      "languages": [
-        "cat",
-        "eng"
-      ]
-    },
-    "text": ""
-  },
  {
    "type": "UncategorizedText",
    "element_id": "69d89b316aa1ff82d60e8438f764b0cf",
@ -419,27 +377,6 @@
    },
    "text": "https://unstructured"
  },
-  {
-    "type": "ListItem",
-    "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
-    "metadata": {
-      "data_source": {
-        "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
-        "record_locator": {
-          "base_url": "https://unstructured-jira-connector-test.atlassian.net",
-          "issue_key": "JCTP1-1"
-        },
-        "date_created": "2023-08-22T11:29:37.774000+00:00",
-        "date_modified": "2023-08-24T12:05:04.690000+00:00"
-      },
-      "filetype": "text/plain",
-      "languages": [
-        "cat",
-        "eng"
-      ]
-    },
-    "text": ""
-  },
  {
    "type": "Title",
    "element_id": "1e32847769dc4a1588004a1dfdf10041",
@ -461,27 +398,6 @@
    },
    "text": "jira"
  },
-  {
-    "type": "ListItem",
-    "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
-    "metadata": {
-      "data_source": {
-        "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
-        "record_locator": {
-          "base_url": "https://unstructured-jira-connector-test.atlassian.net",
-          "issue_key": "JCTP1-1"
-        },
-        "date_created": "2023-08-22T11:29:37.774000+00:00",
-        "date_modified": "2023-08-24T12:05:04.690000+00:00"
-      },
-      "filetype": "text/plain",
-      "languages": [
-        "cat",
-        "eng"
-      ]
-    },
-    "text": ""
-  },
  {
    "type": "Title",
    "element_id": "c08c6acfff81cafe379f88061e6b71bf",
@ -503,27 +419,6 @@
    },
    "text": "connector"
  },
-  {
-    "type": "ListItem",
-    "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
-    "metadata": {
-      "data_source": {
-        "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
-        "record_locator": {
-          "base_url": "https://unstructured-jira-connector-test.atlassian.net",
-          "issue_key": "JCTP1-1"
-        },
-        "date_created": "2023-08-22T11:29:37.774000+00:00",
-        "date_modified": "2023-08-24T12:05:04.690000+00:00"
-      },
-      "filetype": "text/plain",
-      "languages": [
-        "cat",
-        "eng"
-      ]
-    },
-    "text": ""
-  },
  {
    "type": "Title",
    "element_id": "d680679411e53e07cf66ab4bad00bfcd",
--- a/test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/language-docs/UDHR_first_article_all.txt.json
+++ b/test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/language-docs/UDHR_first_article_all.txt.json
@ -764,22 +764,6 @@
    },
    "text": "Awa"
  },
-  {
-    "type": "ListItem",
-    "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
-    "metadata": {
-      "data_source": {
-        "url": "example-docs/language-docs/UDHR_first_article_all.txt",
-        "permissions_data": [
-          {
-            "mode": 33188
-          }
-        ]
-      },
-      "filetype": "text/plain"
-    },
-    "text": ""
-  },
  {
    "type": "Title",
    "element_id": "9989b925ebf5c724d61091d02fab6585",
--- a/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv
+++ b/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv
@ -1,3 +1,3 @@
 strategy	average	sample_sd	population_sd	count
 cct-accuracy	0.798	0.083	0.072	4
-cct-%missing	0.087	0.037	0.032	4
+cct-%missing	0.089	0.04	0.035	4
--- a/test_unstructured_ingest/metrics/all-docs-cct.tsv
+++ b/test_unstructured_ingest/metrics/all-docs-cct.tsv
@ -1,4 +1,5 @@
-filename	connector	cct-accuracy	cct-%missing
-handbook-1p.docx	box	0.974	0.03
-example-10k.html	local	0.686	0.04
-IRS-form-1987.pdf	azure	0.783	0.13
+filename	doctype	connector	cct-accuracy	cct-%missing
+science-exploration-1p.pptx	pptx	dropbox	0.861	0.093
+science-exploration-1p.pptx	pptx	box	0.861	0.093
+example-10k.html	html	local	0.686	0.037
+IRS-form-1987.pdf	pdf	azure	0.783	0.135
--- a/test_unstructured_ingest/metrics/all-docs-element-type-frequency.tsv
+++ b/test_unstructured_ingest/metrics/all-docs-element-type-frequency.tsv
@ -1 +1 @@
-filename	connector	element-type-accuracy
+filename	doctype	connector	element-type-accuracy
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.10.29-dev8"  # pragma: no cover
+__version__ = "0.10.29-dev9"  # pragma: no cover
--- a/unstructured/ingest/evaluate.py
+++ b/unstructured/ingest/evaluate.py
@ -98,21 +98,24 @@ def measure_text_edit_distance(
    accuracy_scores: List[float] = []
    percent_missing_scores: List[float] = []

+    # assumption: output file name convention is name-of-file.doc.json
    for doc in output_list:  # type: ignore
        fn = (doc.split("/")[-1]).split(".json")[0]
+        doctype = fn.rsplit(".", 1)[-1]
        fn_txt = fn + ".txt"
        connector = doc.split("/")[0]
+
        if fn_txt in source_list:  # type: ignore
            output_cct = elements_to_text(elements_from_json(os.path.join(output_dir, doc)))
            source_cct = _read_text(os.path.join(source_dir, fn_txt))
            accuracy = round(calculate_accuracy(output_cct, source_cct, weights), 3)
            percent_missing = round(calculate_percent_missing_text(output_cct, source_cct), 3)

-            rows.append([fn, connector, accuracy, percent_missing])
+            rows.append([fn, doctype, connector, accuracy, percent_missing])
            accuracy_scores.append(accuracy)
            percent_missing_scores.append(percent_missing)

-    headers = ["filename", "connector", "cct-accuracy", "cct-%missing"]
+    headers = ["filename", "doctype", "connector", "cct-accuracy", "cct-%missing"]
    _write_to_file(export_dir, "all-docs-cct.tsv", rows, headers)

    agg_rows = []
@ -187,15 +190,16 @@ def measure_element_type_accuracy(

    for doc in output_list:  # type: ignore
        fn = (doc.split("/")[-1]).split(".json")[0]
+        doctype = fn.rsplit(".", 1)[-1]
        connector = doc.split("/")[0]
        if doc in source_list:  # type: ignore
            output = get_element_type_frequency(_read_text(os.path.join(output_dir, doc)))
            source = get_element_type_frequency(_read_text(os.path.join(source_dir, doc)))
            accuracy = round(calculate_element_type_percent_match(output, source), 3)
-            rows.append([fn, connector, accuracy])
+            rows.append([fn, doctype, connector, accuracy])
            accuracy_scores.append(accuracy)

-    headers = ["filename", "connector", "element-type-accuracy"]
+    headers = ["filename", "doctype", "connector", "element-type-accuracy"]
    _write_to_file(export_dir, "all-docs-element-type-frequency.tsv", rows, headers)

    agg_rows = []