From c24e6e056c53e5bda1688c6f48339b52552988dc Mon Sep 17 00:00:00 2001
From: shreyanid <42684285+shreyanid@users.noreply.github.com>
Date: Thu, 2 Nov 2023 12:15:53 -0700
Subject: [PATCH] chore: add doctype to ingest evaluation functions (#1977)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
### Summary
To combine the ingest and holistic metrics efforts, add a `doctype` field
to the rows produced by the functions in `unstructured/ingest/evaluate.py`
so that subsequent aggregation functions can use it.
### Test
Run `sh ./test_unstructured_ingest/evaluation-metrics.sh text-extraction`;
the resulting `all-docs-cct.tsv` will contain a new `doctype` column holding
each file's extension (e.g. `pdf`, `html`).
---------
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: shreyanid
---
CHANGELOG.md | 3 +-
.../jira-diff/1/10000.json | 105 ------------------
.../UDHR_first_article_all.txt.json | 16 ---
.../metrics/aggregate-scores-cct.tsv | 2 +-
.../metrics/all-docs-cct.tsv | 9 +-
.../all-docs-element-type-frequency.tsv | 2 +-
unstructured/__version__.py | 2 +-
unstructured/ingest/evaluate.py | 12 +-
8 files changed, 18 insertions(+), 133 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7f4977a2e..8d9c44c22 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,8 @@
-## 0.10.29-dev8
+## 0.10.29-dev9
### Enhancements
+* **Add doctype field to CI metric functions** Adds a doctype column to the ingest metric sheets for use in subsequent aggregations.
* **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance.
* **Separate chipper tests** Chipper tests are long-running and require special access, so the tests have been separated into their own file under their own marker, and now have a separate `make` target.
* **Add include_header argument for partition_csv and partition_tsv** Now supports retaining header rows in CSV and TSV documents element partitioning.
diff --git a/test_unstructured_ingest/expected-structured-output/jira-diff/1/10000.json b/test_unstructured_ingest/expected-structured-output/jira-diff/1/10000.json
index 9218aea30..d03951469 100644
--- a/test_unstructured_ingest/expected-structured-output/jira-diff/1/10000.json
+++ b/test_unstructured_ingest/expected-structured-output/jira-diff/1/10000.json
@@ -230,27 +230,6 @@
},
"text": "!image"
},
- {
- "type": "ListItem",
- "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
- "metadata": {
- "data_source": {
- "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
- "record_locator": {
- "base_url": "https://unstructured-jira-connector-test.atlassian.net",
- "issue_key": "JCTP1-1"
- },
- "date_created": "2023-08-22T11:29:37.774000+00:00",
- "date_modified": "2023-08-24T12:05:04.690000+00:00"
- },
- "filetype": "text/plain",
- "languages": [
- "cat",
- "eng"
- ]
- },
- "text": ""
- },
{
"type": "UncategorizedText",
"element_id": "d1e0c46eef256237b23aa43bfbadb23c",
@@ -272,27 +251,6 @@
},
"text": "20230823"
},
- {
- "type": "ListItem",
- "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
- "metadata": {
- "data_source": {
- "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
- "record_locator": {
- "base_url": "https://unstructured-jira-connector-test.atlassian.net",
- "issue_key": "JCTP1-1"
- },
- "date_created": "2023-08-22T11:29:37.774000+00:00",
- "date_modified": "2023-08-24T12:05:04.690000+00:00"
- },
- "filetype": "text/plain",
- "languages": [
- "cat",
- "eng"
- ]
- },
- "text": ""
- },
{
"type": "UncategorizedText",
"element_id": "69d89b316aa1ff82d60e8438f764b0cf",
@@ -419,27 +377,6 @@
},
"text": "https://unstructured"
},
- {
- "type": "ListItem",
- "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
- "metadata": {
- "data_source": {
- "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
- "record_locator": {
- "base_url": "https://unstructured-jira-connector-test.atlassian.net",
- "issue_key": "JCTP1-1"
- },
- "date_created": "2023-08-22T11:29:37.774000+00:00",
- "date_modified": "2023-08-24T12:05:04.690000+00:00"
- },
- "filetype": "text/plain",
- "languages": [
- "cat",
- "eng"
- ]
- },
- "text": ""
- },
{
"type": "Title",
"element_id": "1e32847769dc4a1588004a1dfdf10041",
@@ -461,27 +398,6 @@
},
"text": "jira"
},
- {
- "type": "ListItem",
- "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
- "metadata": {
- "data_source": {
- "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
- "record_locator": {
- "base_url": "https://unstructured-jira-connector-test.atlassian.net",
- "issue_key": "JCTP1-1"
- },
- "date_created": "2023-08-22T11:29:37.774000+00:00",
- "date_modified": "2023-08-24T12:05:04.690000+00:00"
- },
- "filetype": "text/plain",
- "languages": [
- "cat",
- "eng"
- ]
- },
- "text": ""
- },
{
"type": "Title",
"element_id": "c08c6acfff81cafe379f88061e6b71bf",
@@ -503,27 +419,6 @@
},
"text": "connector"
},
- {
- "type": "ListItem",
- "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
- "metadata": {
- "data_source": {
- "url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
- "record_locator": {
- "base_url": "https://unstructured-jira-connector-test.atlassian.net",
- "issue_key": "JCTP1-1"
- },
- "date_created": "2023-08-22T11:29:37.774000+00:00",
- "date_modified": "2023-08-24T12:05:04.690000+00:00"
- },
- "filetype": "text/plain",
- "languages": [
- "cat",
- "eng"
- ]
- },
- "text": ""
- },
{
"type": "Title",
"element_id": "d680679411e53e07cf66ab4bad00bfcd",
diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/language-docs/UDHR_first_article_all.txt.json b/test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/language-docs/UDHR_first_article_all.txt.json
index a69276ed1..4ba614f91 100644
--- a/test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/language-docs/UDHR_first_article_all.txt.json
+++ b/test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/language-docs/UDHR_first_article_all.txt.json
@@ -764,22 +764,6 @@
},
"text": "Awa"
},
- {
- "type": "ListItem",
- "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
- "metadata": {
- "data_source": {
- "url": "example-docs/language-docs/UDHR_first_article_all.txt",
- "permissions_data": [
- {
- "mode": 33188
- }
- ]
- },
- "filetype": "text/plain"
- },
- "text": ""
- },
{
"type": "Title",
"element_id": "9989b925ebf5c724d61091d02fab6585",
diff --git a/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv b/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv
index 0054e9a7e..e362972a4 100644
--- a/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv
+++ b/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv
@@ -1,3 +1,3 @@
strategy average sample_sd population_sd count
cct-accuracy 0.798 0.083 0.072 4
-cct-%missing 0.087 0.037 0.032 4
+cct-%missing 0.089 0.04 0.035 4
diff --git a/test_unstructured_ingest/metrics/all-docs-cct.tsv b/test_unstructured_ingest/metrics/all-docs-cct.tsv
index 7b714a590..69ffeaaab 100644
--- a/test_unstructured_ingest/metrics/all-docs-cct.tsv
+++ b/test_unstructured_ingest/metrics/all-docs-cct.tsv
@@ -1,4 +1,5 @@
-filename connector cct-accuracy cct-%missing
-handbook-1p.docx box 0.974 0.03
-example-10k.html local 0.686 0.04
-IRS-form-1987.pdf azure 0.783 0.13
+filename doctype connector cct-accuracy cct-%missing
+science-exploration-1p.pptx pptx dropbox 0.861 0.093
+science-exploration-1p.pptx pptx box 0.861 0.093
+example-10k.html html local 0.686 0.037
+IRS-form-1987.pdf pdf azure 0.783 0.135
diff --git a/test_unstructured_ingest/metrics/all-docs-element-type-frequency.tsv b/test_unstructured_ingest/metrics/all-docs-element-type-frequency.tsv
index 50d494248..b5fc1d7d0 100644
--- a/test_unstructured_ingest/metrics/all-docs-element-type-frequency.tsv
+++ b/test_unstructured_ingest/metrics/all-docs-element-type-frequency.tsv
@@ -1 +1 @@
-filename connector element-type-accuracy
+filename doctype connector element-type-accuracy
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index c95c99598..6e0192bbe 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.10.29-dev8" # pragma: no cover
+__version__ = "0.10.29-dev9" # pragma: no cover
diff --git a/unstructured/ingest/evaluate.py b/unstructured/ingest/evaluate.py
index ff78b1a9a..ef8f72d17 100755
--- a/unstructured/ingest/evaluate.py
+++ b/unstructured/ingest/evaluate.py
@@ -98,21 +98,24 @@ def measure_text_edit_distance(
accuracy_scores: List[float] = []
percent_missing_scores: List[float] = []
+ # assumption: output file name convention is name-of-file.doc.json
for doc in output_list: # type: ignore
fn = (doc.split("/")[-1]).split(".json")[0]
+ doctype = fn.rsplit(".", 1)[-1]
fn_txt = fn + ".txt"
connector = doc.split("/")[0]
+
if fn_txt in source_list: # type: ignore
output_cct = elements_to_text(elements_from_json(os.path.join(output_dir, doc)))
source_cct = _read_text(os.path.join(source_dir, fn_txt))
accuracy = round(calculate_accuracy(output_cct, source_cct, weights), 3)
percent_missing = round(calculate_percent_missing_text(output_cct, source_cct), 3)
- rows.append([fn, connector, accuracy, percent_missing])
+ rows.append([fn, doctype, connector, accuracy, percent_missing])
accuracy_scores.append(accuracy)
percent_missing_scores.append(percent_missing)
- headers = ["filename", "connector", "cct-accuracy", "cct-%missing"]
+ headers = ["filename", "doctype", "connector", "cct-accuracy", "cct-%missing"]
_write_to_file(export_dir, "all-docs-cct.tsv", rows, headers)
agg_rows = []
@@ -187,15 +190,16 @@ def measure_element_type_accuracy(
for doc in output_list: # type: ignore
fn = (doc.split("/")[-1]).split(".json")[0]
+ doctype = fn.rsplit(".", 1)[-1]
connector = doc.split("/")[0]
if doc in source_list: # type: ignore
output = get_element_type_frequency(_read_text(os.path.join(output_dir, doc)))
source = get_element_type_frequency(_read_text(os.path.join(source_dir, doc)))
accuracy = round(calculate_element_type_percent_match(output, source), 3)
- rows.append([fn, connector, accuracy])
+ rows.append([fn, doctype, connector, accuracy])
accuracy_scores.append(accuracy)
- headers = ["filename", "connector", "element-type-accuracy"]
+ headers = ["filename", "doctype", "connector", "element-type-accuracy"]
_write_to_file(export_dir, "all-docs-element-type-frequency.tsv", rows, headers)
agg_rows = []