mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-05 19:42:27 +00:00
chore: add doctype to ingest evaluation functions (#1977)
### Summary

To combine the ingest and holistic metrics efforts, add a `doctype` field to the results produced by the functions in evaluate.py so that subsequent aggregation functions can use it.

### Test

Run `sh ./test_unstructured_ingest/evaluation-metrics.sh text-extraction`; the output will contain a new `doctype` column holding each file's extension.

<img width="508" alt="Screenshot 2023-11-01 at 2 23 11 PM" src="https://github.com/Unstructured-IO/unstructured/assets/42684285/44583da9-e7ef-4142-be72-c2247b954bcf">

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: shreyanid <shreyanid@users.noreply.github.com>
This commit is contained in:
parent
d07baed4a1
commit
c24e6e056c
@ -1,7 +1,8 @@
|
||||
## 0.10.29-dev8
|
||||
## 0.10.29-dev9
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Add doctype field to CI metric functions** Adds a doctype column to the ingest metric sheets for use in subsequent aggregations.
|
||||
* **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each extracted file as well as aggregate-level performance.
|
||||
* **Separate chipper tests** Chipper tests are long-running and require special access, so the tests have been separated into their own file under their own marker, and now have a separate `make` target.
|
||||
* **Add include_header argument for partition_csv and partition_tsv** Now supports retaining header rows when partitioning CSV and TSV documents into elements.
|
||||
|
||||
@ -230,27 +230,6 @@
|
||||
},
|
||||
"text": "!image"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
|
||||
"record_locator": {
|
||||
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
|
||||
"issue_key": "JCTP1-1"
|
||||
},
|
||||
"date_created": "2023-08-22T11:29:37.774000+00:00",
|
||||
"date_modified": "2023-08-24T12:05:04.690000+00:00"
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"cat",
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "d1e0c46eef256237b23aa43bfbadb23c",
|
||||
@ -272,27 +251,6 @@
|
||||
},
|
||||
"text": "20230823"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
|
||||
"record_locator": {
|
||||
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
|
||||
"issue_key": "JCTP1-1"
|
||||
},
|
||||
"date_created": "2023-08-22T11:29:37.774000+00:00",
|
||||
"date_modified": "2023-08-24T12:05:04.690000+00:00"
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"cat",
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "69d89b316aa1ff82d60e8438f764b0cf",
|
||||
@ -419,27 +377,6 @@
|
||||
},
|
||||
"text": "https://unstructured"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
|
||||
"record_locator": {
|
||||
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
|
||||
"issue_key": "JCTP1-1"
|
||||
},
|
||||
"date_created": "2023-08-22T11:29:37.774000+00:00",
|
||||
"date_modified": "2023-08-24T12:05:04.690000+00:00"
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"cat",
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "1e32847769dc4a1588004a1dfdf10041",
|
||||
@ -461,27 +398,6 @@
|
||||
},
|
||||
"text": "jira"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
|
||||
"record_locator": {
|
||||
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
|
||||
"issue_key": "JCTP1-1"
|
||||
},
|
||||
"date_created": "2023-08-22T11:29:37.774000+00:00",
|
||||
"date_modified": "2023-08-24T12:05:04.690000+00:00"
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"cat",
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "c08c6acfff81cafe379f88061e6b71bf",
|
||||
@ -503,27 +419,6 @@
|
||||
},
|
||||
"text": "connector"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "https://unstructured-jira-connector-test.atlassian.net/browse/JCTP1-1",
|
||||
"record_locator": {
|
||||
"base_url": "https://unstructured-jira-connector-test.atlassian.net",
|
||||
"issue_key": "JCTP1-1"
|
||||
},
|
||||
"date_created": "2023-08-22T11:29:37.774000+00:00",
|
||||
"date_modified": "2023-08-24T12:05:04.690000+00:00"
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"cat",
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "d680679411e53e07cf66ab4bad00bfcd",
|
||||
|
||||
@ -764,22 +764,6 @@
|
||||
},
|
||||
"text": "Awa"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "example-docs/language-docs/UDHR_first_article_all.txt",
|
||||
"permissions_data": [
|
||||
{
|
||||
"mode": 33188
|
||||
}
|
||||
]
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "9989b925ebf5c724d61091d02fab6585",
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
strategy average sample_sd population_sd count
|
||||
cct-accuracy 0.798 0.083 0.072 4
|
||||
cct-%missing 0.087 0.037 0.032 4
|
||||
cct-%missing 0.089 0.04 0.035 4
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
filename connector cct-accuracy cct-%missing
|
||||
handbook-1p.docx box 0.974 0.03
|
||||
example-10k.html local 0.686 0.04
|
||||
IRS-form-1987.pdf azure 0.783 0.13
|
||||
filename doctype connector cct-accuracy cct-%missing
|
||||
science-exploration-1p.pptx pptx dropbox 0.861 0.093
|
||||
science-exploration-1p.pptx pptx box 0.861 0.093
|
||||
example-10k.html html local 0.686 0.037
|
||||
IRS-form-1987.pdf pdf azure 0.783 0.135
|
||||
|
||||
|
@ -1 +1 @@
|
||||
filename connector element-type-accuracy
|
||||
filename doctype connector element-type-accuracy
|
||||
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.10.29-dev8" # pragma: no cover
|
||||
__version__ = "0.10.29-dev9" # pragma: no cover
|
||||
|
||||
@ -98,21 +98,24 @@ def measure_text_edit_distance(
|
||||
accuracy_scores: List[float] = []
|
||||
percent_missing_scores: List[float] = []
|
||||
|
||||
# assumption: output file name convention is name-of-file.doc.json
|
||||
for doc in output_list: # type: ignore
|
||||
fn = (doc.split("/")[-1]).split(".json")[0]
|
||||
doctype = fn.rsplit(".", 1)[-1]
|
||||
fn_txt = fn + ".txt"
|
||||
connector = doc.split("/")[0]
|
||||
|
||||
if fn_txt in source_list: # type: ignore
|
||||
output_cct = elements_to_text(elements_from_json(os.path.join(output_dir, doc)))
|
||||
source_cct = _read_text(os.path.join(source_dir, fn_txt))
|
||||
accuracy = round(calculate_accuracy(output_cct, source_cct, weights), 3)
|
||||
percent_missing = round(calculate_percent_missing_text(output_cct, source_cct), 3)
|
||||
|
||||
rows.append([fn, connector, accuracy, percent_missing])
|
||||
rows.append([fn, doctype, connector, accuracy, percent_missing])
|
||||
accuracy_scores.append(accuracy)
|
||||
percent_missing_scores.append(percent_missing)
|
||||
|
||||
headers = ["filename", "connector", "cct-accuracy", "cct-%missing"]
|
||||
headers = ["filename", "doctype", "connector", "cct-accuracy", "cct-%missing"]
|
||||
_write_to_file(export_dir, "all-docs-cct.tsv", rows, headers)
|
||||
|
||||
agg_rows = []
|
||||
@ -187,15 +190,16 @@ def measure_element_type_accuracy(
|
||||
|
||||
for doc in output_list: # type: ignore
|
||||
fn = (doc.split("/")[-1]).split(".json")[0]
|
||||
doctype = fn.rsplit(".", 1)[-1]
|
||||
connector = doc.split("/")[0]
|
||||
if doc in source_list: # type: ignore
|
||||
output = get_element_type_frequency(_read_text(os.path.join(output_dir, doc)))
|
||||
source = get_element_type_frequency(_read_text(os.path.join(source_dir, doc)))
|
||||
accuracy = round(calculate_element_type_percent_match(output, source), 3)
|
||||
rows.append([fn, connector, accuracy])
|
||||
rows.append([fn, doctype, connector, accuracy])
|
||||
accuracy_scores.append(accuracy)
|
||||
|
||||
headers = ["filename", "connector", "element-type-accuracy"]
|
||||
headers = ["filename", "doctype", "connector", "element-type-accuracy"]
|
||||
_write_to_file(export_dir, "all-docs-element-type-frequency.tsv", rows, headers)
|
||||
|
||||
agg_rows = []
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user