chore: add doctype to ingest evaluation functions (#1977)

### Summary
To combine ingest and holistic metrics efforts, add the `doctype` field
to the results from the functions in evaluate.py for use in subsequent
aggregation functions.

### Test
Run `sh ./test_unstructured_ingest/evaluation-metrics.sh text-extraction`;
the resulting metrics sheet will contain a new `doctype` column holding each
file's extension (e.g. `pdf`, `html`, `pptx`).
<img width="508" alt="Screenshot 2023-11-01 at 2 23 11 PM"
src="https://github.com/Unstructured-IO/unstructured/assets/42684285/44583da9-e7ef-4142-be72-c2247b954bcf">

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: shreyanid <shreyanid@users.noreply.github.com>
This commit is contained in:
shreyanid 2023-11-02 12:15:53 -07:00 committed by GitHub
parent d07baed4a1
commit c24e6e056c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 18 additions and 133 deletions

View File

@ -1,7 +1,8 @@
## 0.10.29-dev8
## 0.10.29-dev9
### Enhancements
* **Add doctype field to CI metric functions** Adds a doctype column to the ingest metric sheets for use in subsequent aggregations.
* **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance.
* **Separate chipper tests** Chipper tests are long-running and require special access, so the tests have been separated into their own file under their own marker, and now have a separate `make` target.
* **Add include_header argument for partition_csv and partition_tsv** Now supports retaining header rows when partitioning CSV and TSV documents into elements.

View File

@ -230,27 +230,6 @@
},
"text": "!image"
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": ""
},
{
"type": "UncategorizedText",
"element_id": "d1e0c46eef256237b23aa43bfbadb23c",
@ -272,27 +251,6 @@
},
"text": "20230823"
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": ""
},
{
"type": "UncategorizedText",
"element_id": "69d89b316aa1ff82d60e8438f764b0cf",
@ -419,27 +377,6 @@
},
"text": "https://unstructured"
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": ""
},
{
"type": "Title",
"element_id": "1e32847769dc4a1588004a1dfdf10041",
@ -461,27 +398,6 @@
},
"text": "jira"
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": ""
},
{
"type": "Title",
"element_id": "c08c6acfff81cafe379f88061e6b71bf",
@ -503,27 +419,6 @@
},
"text": "connector"
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
"record_locator": {
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
"issue_key": "JCTP1-1"
},
"date_created": "2023-08-22T11:29:37.774000+00:00",
"date_modified": "2023-08-24T12:05:04.690000+00:00"
},
"filetype": "text/plain",
"languages": [
"cat",
"eng"
]
},
"text": ""
},
{
"type": "Title",
"element_id": "d680679411e53e07cf66ab4bad00bfcd",

View File

@ -764,22 +764,6 @@
},
"text": "Awa"
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {
"url": "example-docs/language-docs/UDHR_first_article_all.txt",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "text/plain"
},
"text": ""
},
{
"type": "Title",
"element_id": "9989b925ebf5c724d61091d02fab6585",

View File

@ -1,3 +1,3 @@
strategy average sample_sd population_sd count
cct-accuracy 0.798 0.083 0.072 4
cct-%missing 0.087 0.037 0.032 4
cct-%missing 0.089 0.04 0.035 4

1 strategy average sample_sd population_sd count
2 cct-accuracy 0.798 0.083 0.072 4
3 cct-%missing 0.087 0.089 0.037 0.04 0.032 0.035 4

View File

@ -1,4 +1,5 @@
filename connector cct-accuracy cct-%missing
handbook-1p.docx box 0.974 0.03
example-10k.html local 0.686 0.04
IRS-form-1987.pdf azure 0.783 0.13
filename doctype connector cct-accuracy cct-%missing
science-exploration-1p.pptx pptx dropbox 0.861 0.093
science-exploration-1p.pptx pptx box 0.861 0.093
example-10k.html html local 0.686 0.037
IRS-form-1987.pdf pdf azure 0.783 0.135

1 filename doctype connector cct-accuracy cct-%missing
2 handbook-1p.docx science-exploration-1p.pptx pptx box dropbox 0.974 0.861 0.03 0.093
3 example-10k.html science-exploration-1p.pptx pptx local box 0.686 0.861 0.04 0.093
4 IRS-form-1987.pdf example-10k.html html azure local 0.783 0.686 0.13 0.037
5 IRS-form-1987.pdf pdf azure 0.783 0.135

View File

@ -1 +1 @@
filename connector element-type-accuracy
filename doctype connector element-type-accuracy

1 filename doctype connector element-type-accuracy

View File

@ -1 +1 @@
__version__ = "0.10.29-dev8" # pragma: no cover
__version__ = "0.10.29-dev9" # pragma: no cover

View File

@ -98,21 +98,24 @@ def measure_text_edit_distance(
accuracy_scores: List[float] = []
percent_missing_scores: List[float] = []
# assumption: output file name convention is name-of-file.doc.json
for doc in output_list: # type: ignore
fn = (doc.split("/")[-1]).split(".json")[0]
doctype = fn.rsplit(".", 1)[-1]
fn_txt = fn + ".txt"
connector = doc.split("/")[0]
if fn_txt in source_list: # type: ignore
output_cct = elements_to_text(elements_from_json(os.path.join(output_dir, doc)))
source_cct = _read_text(os.path.join(source_dir, fn_txt))
accuracy = round(calculate_accuracy(output_cct, source_cct, weights), 3)
percent_missing = round(calculate_percent_missing_text(output_cct, source_cct), 3)
rows.append([fn, connector, accuracy, percent_missing])
rows.append([fn, doctype, connector, accuracy, percent_missing])
accuracy_scores.append(accuracy)
percent_missing_scores.append(percent_missing)
headers = ["filename", "connector", "cct-accuracy", "cct-%missing"]
headers = ["filename", "doctype", "connector", "cct-accuracy", "cct-%missing"]
_write_to_file(export_dir, "all-docs-cct.tsv", rows, headers)
agg_rows = []
@ -187,15 +190,16 @@ def measure_element_type_accuracy(
for doc in output_list: # type: ignore
fn = (doc.split("/")[-1]).split(".json")[0]
doctype = fn.rsplit(".", 1)[-1]
connector = doc.split("/")[0]
if doc in source_list: # type: ignore
output = get_element_type_frequency(_read_text(os.path.join(output_dir, doc)))
source = get_element_type_frequency(_read_text(os.path.join(source_dir, doc)))
accuracy = round(calculate_element_type_percent_match(output, source), 3)
rows.append([fn, connector, accuracy])
rows.append([fn, doctype, connector, accuracy])
accuracy_scores.append(accuracy)
headers = ["filename", "connector", "element-type-accuracy"]
headers = ["filename", "doctype", "connector", "element-type-accuracy"]
_write_to_file(export_dir, "all-docs-element-type-frequency.tsv", rows, headers)
agg_rows = []