From c24e6e056c53e5bda1688c6f48339b52552988dc Mon Sep 17 00:00:00 2001
From: shreyanid <42684285+shreyanid@users.noreply.github.com>
Date: Thu, 2 Nov 2023 12:15:53 -0700
Subject: [PATCH] chore: add doctype to ingest evaluation functions (#1977)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Summary
To combine ingest and holistic metrics efforts, add the `doctype` field
to the results from the functions in evaluate.py for use in subsequent
aggregation functions.

### Test
Run `sh ./test_unstructured_ingest/evaluation-metrics.sh text-extraction`;
the generated metrics sheet will contain a new `doctype` column holding
each file's extension (e.g. `pdf`, `html`).
<img width="508" alt="Screenshot 2023-11-01 at 2 23 11 PM"
src="https://github.com/Unstructured-IO/unstructured/assets/42684285/44583da9-e7ef-4142-be72-c2247b954bcf">

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: shreyanid <shreyanid@users.noreply.github.com>
---
 CHANGELOG.md                                  |   3 +-
 .../jira-diff/1/10000.json                    | 105 ------------------
 .../UDHR_first_article_all.txt.json           |  16 ---
 .../metrics/aggregate-scores-cct.tsv          |   2 +-
 .../metrics/all-docs-cct.tsv                  |   9 +-
 .../all-docs-element-type-frequency.tsv       |   2 +-
 unstructured/__version__.py                   |   2 +-
 unstructured/ingest/evaluate.py               |  12 +-
 8 files changed, 18 insertions(+), 133 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7f4977a2e..8d9c44c22 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,8 @@
-## 0.10.29-dev8
+## 0.10.29-dev9
 
 ### Enhancements
 
+* **Add doctype field to CI metric functions** Adds a doctype column to the ingest metric sheets for use in subsequent aggregations.
 * **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance.
 * **Separate chipper tests** Chipper tests are long-running and require special access, so the tests have been separated into their own file under their own marker, and now have a separate `make` target.
 * **Add include_header argument for partition_csv and partition_tsv** Now supports retaining header rows in CSV and TSV documents element partitioning.
diff --git a/test_unstructured_ingest/expected-structured-output/jira-diff/1/10000.json b/test_unstructured_ingest/expected-structured-output/jira-diff/1/10000.json
index 9218aea30..d03951469 100644
--- a/test_unstructured_ingest/expected-structured-output/jira-diff/1/10000.json
+++ b/test_unstructured_ingest/expected-structured-output/jira-diff/1/10000.json
@@ -230,27 +230,6 @@
     },
     "text": "!image"
   },
-  {
-    "type": "ListItem",
-    "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
-    "metadata": {
-      "data_source": {
-        "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
-        "record_locator": {
-          "base_url": "https://unstructured-jira-connector-test.atlassian.net",
-          "issue_key": "JCTP1-1"
-        },
-        "date_created": "2023-08-22T11:29:37.774000+00:00",
-        "date_modified": "2023-08-24T12:05:04.690000+00:00"
-      },
-      "filetype": "text/plain",
-      "languages": [
-        "cat",
-        "eng"
-      ]
-    },
-    "text": ""
-  },
   {
     "type": "UncategorizedText",
     "element_id": "d1e0c46eef256237b23aa43bfbadb23c",
@@ -272,27 +251,6 @@
     },
     "text": "20230823"
   },
-  {
-    "type": "ListItem",
-    "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
-    "metadata": {
-      "data_source": {
-        "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
-        "record_locator": {
-          "base_url": "https://unstructured-jira-connector-test.atlassian.net",
-          "issue_key": "JCTP1-1"
-        },
-        "date_created": "2023-08-22T11:29:37.774000+00:00",
-        "date_modified": "2023-08-24T12:05:04.690000+00:00"
-      },
-      "filetype": "text/plain",
-      "languages": [
-        "cat",
-        "eng"
-      ]
-    },
-    "text": ""
-  },
   {
     "type": "UncategorizedText",
     "element_id": "69d89b316aa1ff82d60e8438f764b0cf",
@@ -419,27 +377,6 @@
     },
     "text": "https://unstructured"
   },
-  {
-    "type": "ListItem",
-    "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
-    "metadata": {
-      "data_source": {
-        "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
-        "record_locator": {
-          "base_url": "https://unstructured-jira-connector-test.atlassian.net",
-          "issue_key": "JCTP1-1"
-        },
-        "date_created": "2023-08-22T11:29:37.774000+00:00",
-        "date_modified": "2023-08-24T12:05:04.690000+00:00"
-      },
-      "filetype": "text/plain",
-      "languages": [
-        "cat",
-        "eng"
-      ]
-    },
-    "text": ""
-  },
   {
     "type": "Title",
     "element_id": "1e32847769dc4a1588004a1dfdf10041",
@@ -461,27 +398,6 @@
     },
     "text": "jira"
   },
-  {
-    "type": "ListItem",
-    "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
-    "metadata": {
-      "data_source": {
-        "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
-        "record_locator": {
-          "base_url": "https://unstructured-jira-connector-test.atlassian.net",
-          "issue_key": "JCTP1-1"
-        },
-        "date_created": "2023-08-22T11:29:37.774000+00:00",
-        "date_modified": "2023-08-24T12:05:04.690000+00:00"
-      },
-      "filetype": "text/plain",
-      "languages": [
-        "cat",
-        "eng"
-      ]
-    },
-    "text": ""
-  },
   {
     "type": "Title",
     "element_id": "c08c6acfff81cafe379f88061e6b71bf",
@@ -503,27 +419,6 @@
     },
     "text": "connector"
   },
-  {
-    "type": "ListItem",
-    "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
-    "metadata": {
-      "data_source": {
-        "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
-        "record_locator": {
-          "base_url": "https://unstructured-jira-connector-test.atlassian.net",
-          "issue_key": "JCTP1-1"
-        },
-        "date_created": "2023-08-22T11:29:37.774000+00:00",
-        "date_modified": "2023-08-24T12:05:04.690000+00:00"
-      },
-      "filetype": "text/plain",
-      "languages": [
-        "cat",
-        "eng"
-      ]
-    },
-    "text": ""
-  },
   {
     "type": "Title",
     "element_id": "d680679411e53e07cf66ab4bad00bfcd",
diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/language-docs/UDHR_first_article_all.txt.json b/test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/language-docs/UDHR_first_article_all.txt.json
index a69276ed1..4ba614f91 100644
--- a/test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/language-docs/UDHR_first_article_all.txt.json
+++ b/test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/language-docs/UDHR_first_article_all.txt.json
@@ -764,22 +764,6 @@
     },
     "text": "Awa"
   },
-  {
-    "type": "ListItem",
-    "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
-    "metadata": {
-      "data_source": {
-        "url": "example-docs/language-docs/UDHR_first_article_all.txt",
-        "permissions_data": [
-          {
-            "mode": 33188
-          }
-        ]
-      },
-      "filetype": "text/plain"
-    },
-    "text": ""
-  },
   {
     "type": "Title",
     "element_id": "9989b925ebf5c724d61091d02fab6585",
diff --git a/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv b/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv
index 0054e9a7e..e362972a4 100644
--- a/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv
+++ b/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv
@@ -1,3 +1,3 @@
 strategy	average	sample_sd	population_sd	count
 cct-accuracy	0.798	0.083	0.072	4
-cct-%missing	0.087	0.037	0.032	4
+cct-%missing	0.089	0.04	0.035	4
diff --git a/test_unstructured_ingest/metrics/all-docs-cct.tsv b/test_unstructured_ingest/metrics/all-docs-cct.tsv
index 7b714a590..69ffeaaab 100644
--- a/test_unstructured_ingest/metrics/all-docs-cct.tsv
+++ b/test_unstructured_ingest/metrics/all-docs-cct.tsv
@@ -1,4 +1,5 @@
-filename	connector	cct-accuracy	cct-%missing
-handbook-1p.docx	box	0.974	0.03
-example-10k.html	local	0.686	0.04
-IRS-form-1987.pdf	azure	0.783	0.13
+filename	doctype	connector	cct-accuracy	cct-%missing
+science-exploration-1p.pptx	pptx	dropbox	0.861	0.093
+science-exploration-1p.pptx	pptx	box	0.861	0.093
+example-10k.html	html	local	0.686	0.037
+IRS-form-1987.pdf	pdf	azure	0.783	0.135
diff --git a/test_unstructured_ingest/metrics/all-docs-element-type-frequency.tsv b/test_unstructured_ingest/metrics/all-docs-element-type-frequency.tsv
index 50d494248..b5fc1d7d0 100644
--- a/test_unstructured_ingest/metrics/all-docs-element-type-frequency.tsv
+++ b/test_unstructured_ingest/metrics/all-docs-element-type-frequency.tsv
@@ -1 +1 @@
-filename	connector	element-type-accuracy
+filename	doctype	connector	element-type-accuracy
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index c95c99598..6e0192bbe 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.10.29-dev8"  # pragma: no cover
+__version__ = "0.10.29-dev9"  # pragma: no cover
diff --git a/unstructured/ingest/evaluate.py b/unstructured/ingest/evaluate.py
index ff78b1a9a..ef8f72d17 100755
--- a/unstructured/ingest/evaluate.py
+++ b/unstructured/ingest/evaluate.py
@@ -98,21 +98,24 @@ def measure_text_edit_distance(
     accuracy_scores: List[float] = []
     percent_missing_scores: List[float] = []
 
+    # assumption: output file name convention is name-of-file.doc.json
     for doc in output_list:  # type: ignore
         fn = (doc.split("/")[-1]).split(".json")[0]
+        doctype = fn.rsplit(".", 1)[-1]
         fn_txt = fn + ".txt"
         connector = doc.split("/")[0]
+
         if fn_txt in source_list:  # type: ignore
             output_cct = elements_to_text(elements_from_json(os.path.join(output_dir, doc)))
             source_cct = _read_text(os.path.join(source_dir, fn_txt))
             accuracy = round(calculate_accuracy(output_cct, source_cct, weights), 3)
             percent_missing = round(calculate_percent_missing_text(output_cct, source_cct), 3)
 
-            rows.append([fn, connector, accuracy, percent_missing])
+            rows.append([fn, doctype, connector, accuracy, percent_missing])
             accuracy_scores.append(accuracy)
             percent_missing_scores.append(percent_missing)
 
-    headers = ["filename", "connector", "cct-accuracy", "cct-%missing"]
+    headers = ["filename", "doctype", "connector", "cct-accuracy", "cct-%missing"]
     _write_to_file(export_dir, "all-docs-cct.tsv", rows, headers)
 
     agg_rows = []
@@ -187,15 +190,16 @@ def measure_element_type_accuracy(
 
     for doc in output_list:  # type: ignore
         fn = (doc.split("/")[-1]).split(".json")[0]
+        doctype = fn.rsplit(".", 1)[-1]
         connector = doc.split("/")[0]
         if doc in source_list:  # type: ignore
             output = get_element_type_frequency(_read_text(os.path.join(output_dir, doc)))
             source = get_element_type_frequency(_read_text(os.path.join(source_dir, doc)))
             accuracy = round(calculate_element_type_percent_match(output, source), 3)
-            rows.append([fn, connector, accuracy])
+            rows.append([fn, doctype, connector, accuracy])
             accuracy_scores.append(accuracy)
 
-    headers = ["filename", "connector", "element-type-accuracy"]
+    headers = ["filename", "doctype", "connector", "element-type-accuracy"]
     _write_to_file(export_dir, "all-docs-element-type-frequency.tsv", rows, headers)
 
     agg_rows = []