feat(eval): table evaluation metrics (#2558)

This PR adds new table evaluation metrics prepared by @leah1985 
The metrics include:
- `table count` (check)
- `table_level_acc` - accuracy of table detection
- `element_col_level_index_acc` - accuracy of cell detection in columns
- `element_row_level_index_acc` - accuracy of cell detection in rows
- `element_col_level_content_acc` - accuracy of content detected in
columns
- `element_row_level_content_acc` - accuracy of content detected in rows

TODO in next steps:
- create a minimal dataset and upload to s3 for ingest tests
- generate and add metrics on the above dataset to
`test_unstructured_ingest/metrics`
This commit is contained in:
Pawel Kmiecik 2024-02-22 17:35:46 +01:00 committed by GitHub
parent 1947375b2e
commit ff9d46f9dc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 2334 additions and 3 deletions

View File

@ -20,8 +20,8 @@ repos:
args: ["--line-length=100"] args: ["--line-length=100"]
language_version: python3 language_version: python3
- repo: https://github.com/charliermarsh/ruff-pre-commit - repo: https://github.com/astral-sh/ruff-pre-commit
rev: "v0.0.230" rev: v0.2.1
hooks: hooks:
- id: ruff - id: ruff
args: args:

View File

@ -5,6 +5,7 @@
### Features ### Features
* **Add parent_element to overlapping case output** Adds parent_element to the output for `identify_overlapping_or_nesting_case` and `catch_overlapping_and_nested_bboxes` functions. * **Add parent_element to overlapping case output** Adds parent_element to the output for `identify_overlapping_or_nesting_case` and `catch_overlapping_and_nested_bboxes` functions.
* **Add table structure evaluation** Adds a new function that evaluates extracted tables against a gold standard and returns metrics representing the quality of both the table structure and the table contents.
### Fixes ### Fixes

View File

@ -0,0 +1,599 @@
[
{
"element_id": "32fd0777e6bf9ab8ffdf0bc98d236660",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 1
},
"text": "CAUTION: NOT FOR FILING",
"type": "Title"
},
{
"element_id": "2415b0b6c0a4a1193a0b2c1f72c2687a",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Form 1095-A is provided here for informational purposes only. Health Insurance Marketplaces use Form 1095-A to report information on enrollments in a qualified health plan in the individual market through the Marketplace. As the form is to be completed by the Marketplaces, individuals cannot complete and use Form 1095-A available on IRS.gov. Individuals receiving a completed Form 1095-A from the Health Insurance Marketplace will use the information received on the form and the guidance in the instructions to assist them in filing an accurate tax return.",
"type": "NarrativeText"
},
{
"element_id": "c78cc8bc220ef978871c8ce7b77aca9e",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2
},
"text": "Form 1095-A",
"type": "Title"
},
{
"element_id": "4f4b828be49912732b002f7d975f2a47",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2
},
"text": "Health Insurance Marketplace Statement",
"type": "Header"
},
{
"element_id": "90251d5ff4018bf08f45ca85f613ed0b",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2
},
"text": "VOID",
"type": "Title"
},
{
"element_id": "1ea2600ce51a288c15e846b7d5f36c57",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2
},
"text": "OMB No. 1545-2232",
"type": "Header"
},
{
"element_id": "d03a8a9f07e60c8122532448b03b32f3",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2
},
"text": "Department of the Treasury Internal Revenue Service",
"type": "NarrativeText"
},
{
"element_id": "c06c1be4d49b392523cb6d468f7069e1",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2
},
"text": "Do not attach to your tax return. Keep for your records. Go to www.irs.gov/Form1095A for instructions and the latest information.",
"type": "NarrativeText"
},
{
"element_id": "9ba6c34da4e65b2a4eede121ef45c20c",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2
},
"text": "CORRECTED",
"type": "Title"
},
{
"element_id": "0c209ae41d8dc31972b9ae2255c78c5d",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2
},
"text": "2023 ",
"type": "Image"
},
{
"element_id": "1cede1791a3bf0486bb15fcef30d8293",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2
},
"text": "Part I Recipient Information",
"type": "Title"
},
{
"element_id": "ff474028197add36ffdec72cedb778bd",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2,
"text_as_html": "<table><thead><th>1</th><th>Marketplace identifier</th><th>2 Marketplace-assigned policy number</th><th colspan=\"3\">3 Policy issuers name</th></thead><thead><th>4</th><th>Recipients name</th><th></th><th>5 Recipients SSN</th><th>6</th><th>Recipients date of birth</th></thead><tr><td>7</td><td>Recipients spouses name</td><td></td><td>8 Recipients spouses SSN</td><td>9</td><td>Recipients spouses date of birth</td></tr><tr><td>10</td><td>Policy start date</td><td>11 Policy termination date</td><td colspan=\"3\">12 Street address (including apartment no.)</td></tr><tr><td>13</td><td>City or town</td><td>14 State or province</td><td colspan=\"3\">15 Country and ZIP or foreign postal code</td></tr></table>"
},
"text": "1 Marketplace identifier 2 Marketplace-assigned policy number 3 Policy issuers name 4 Recipients name 5 Recipients SSN 6 Recipients date of birth 7 Recipients spouses name 8 Recipients spouses SSN 9 Recipients spouses date of birth 10 Policy start date 11 Policy termination date 12 Street address (including apartment no.) 13 City or town 14 State or province 15 Country and ZIP or foreign postal code",
"type": "Table"
},
{
"element_id": "7f740a165e0bc921cb2e9b0d24399743",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2
},
"text": "Part II Covered Individuals",
"type": "Title"
},
{
"element_id": "c8e5c7d9cef4d7d42f7576a6f0257912",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2,
"text_as_html": "<table><thead><th>A. Covered individual name</th><th>B. Covered individual SSN</th><th>C. Covered individual date of birth</th><th>D. Coverage start date</th><th>E. Coverage termination date</th></thead><tr><td colspan=\"5\">16</td></tr><tr><td colspan=\"5\">17</td></tr><tr><td>18</td><td></td><td></td><td></td><td></td></tr><tr><td colspan=\"5\">19</td></tr><tr><td>20</td><td></td><td></td><td></td><td></td></tr></table>"
},
"text": "A. Covered individual name B. Covered individual SSN C. Covered individual date of birth D. Coverage start date E. Coverage termination date 16 17 18 19 20",
"type": "Table"
},
{
"element_id": "cec692aeb8c9a0f36906c9784823943f",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2
},
"text": "Part III Coverage Information",
"type": "Title"
},
{
"element_id": "168cf6186d92272d26f9c486b640ea4d",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2,
"text_as_html": "<table><thead><th></th><th>Month A.</th><th>Monthly enrollment</th><th>premiums B. Monthly second lowest cost silver plan (SLCSP) premium</th><th>C. Monthly advance payment of premium tax credit</th></thead><tr><td>22</td><td>February</td><td></td><td></td><td></td></tr><tr><td>23</td><td>March</td><td></td><td></td><td></td></tr><tr><td>24</td><td>April</td><td></td><td></td><td></td></tr><tr><td>25</td><td>May</td><td></td><td></td><td></td></tr><tr><td>26</td><td>June</td><td></td><td></td><td></td></tr><tr><td>27</td><td>July</td><td></td><td></td><td></td></tr><tr><td>28</td><td>August</td><td></td><td></td><td></td></tr><tr><td>29</td><td>September</td><td></td><td></td><td></td></tr><tr><td>30</td><td>October</td><td></td><td></td><td></td></tr><tr><td>31</td><td>November</td><td></td><td></td><td></td></tr><tr><td>32</td><td>December</td><td></td><td></td><td></td></tr></table>"
},
"text": "Month A. Monthly enrollment premiums B. Monthly second lowest cost silver plan (SLCSP) premium C. Monthly advance payment of premium tax credit 21 January 22 February 23 March 24 April 25 May 26 June 27 July 28 August 29 September 30 October 31 November 32 December",
"type": "Table"
},
{
"element_id": "c508f4f0fea52ddb36fd48c15d3a04bc",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2
},
"text": "33 Annual Totals For Privacy Act and Paperwork Reduction Act Notice, see separate instructions.",
"type": "UncategorizedText"
},
{
"element_id": "21c2f28c80dac74fdd2697e558a061fe",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2
},
"text": "Cat. No. 60703Q",
"type": "UncategorizedText"
},
{
"element_id": "db879eb2cb861d538e88bad00211ebb3",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2
},
"text": "Form 1095-A (2023)",
"type": "UncategorizedText"
},
{
"element_id": "04569cac0eb6a52ca631ada9921a8543",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 3
},
"text": "Form 1095-A (2023)",
"type": "Header"
},
{
"element_id": "2a862b1ef802bf890f6a83d2e20f2ca2",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 3
},
"text": "Instructions for Recipient You received this Form 1095-A because you or a family member enrolled in health insurance coverage through the Health Insurance Marketplace. This Form 1095-A provides information you need to complete Form 8962, Premium Tax Credit (PTC). You must complete Form 8962 and file it with your tax return (Form 1040, Form 1040-SR, or Form 1040-NR) if any amount other than zero is shown in Part III, column C, of this Form 1095-A (meaning that you received premium assistance through advance payments of the premium tax credit (also called advance credit payments)) or if you want to take the premium tax credit. The filing requirement applies whether or not youre otherwise required to file a tax return. If you are filing Form 8962, you cannot file Form 1040-NR-EZ, Form 1040-SS, or Form 1040-PR. The Marketplace has also reported the information on this form to the IRS. If you or your family members enrolled at the Marketplace in more than one qualified health plan policy, you will receive a Form 1095-A for each policy. Check the information on this form carefully. Please contact your Marketplace if you have questions concerning its accuracy. If you or your family members were enrolled in a Marketplace catastrophic health plan or separate dental policy, you arent entitled to take a premium tax credit for this coverage when you file your return, even if you received a Form 1095-A for this coverage. For additional information related to Form 1095-A, go to www.irs.gov/Affordable-Care-Act/Individuals-and- Families/Health-Insurance-Marketplace-Statements. Additional information. For additional information about the tax provisions of the Affordable Care Act (ACA), including the premium tax credit, see www.irs.gov/Affordable-Care-Act/Individuals-and-Families or call the IRS Healthcare Hotline for ACA questions (800-919-0452). VOID box. 
If the “VOID” box is checked at the top of the form, you previously received a Form 1095-A for the policy described in Part I. That Form 1095-A was sent in error. You shouldnt have received a Form 1095-A for this policy. Dont use the information on this or the previously received Form 1095-A to figure your premium tax credit on Form 8962. CORRECTED box. If the “CORRECTED” box is checked at the top of the form, use the information on this Form 1095-A to figure the premium tax credit and reconcile any advance credit payments on Form 8962. Dont use the information on the original Form 1095-A you received for this policy. Part I. Recipient Information, lines 115. Part I reports information about you, the insurance company that issued your policy, and the Marketplace where you enrolled in the coverage. Line 1. This line identifies the state where you enrolled in coverage through the Marketplace. Line 2. This line is the policy number assigned by the Marketplace to identify the policy in which you enrolled. If you are completing Part IV of Form 8962, enter this number on line 30, 31, 32, or 33, box a. Line 3. This is the name of the insurance company that issued your policy. Line 4. You are the recipient because you are the person the Marketplace identified at enrollment who is expected to file a tax return and who, if qualified, would take the premium tax credit for the year of coverage. Line 5. This is your social security number (SSN). For your protection, this form may show only the last four digits. However, the Marketplace has reported your complete SSN to the IRS. Line 6. A date of birth will be entered if there is no SSN on line 5. Lines 7, 8, and 9. Information about your spouse will be entered only if advance credit payments were made for your coverage. The date of birth will be entered on line 9 only if line 8 is blank. Lines 10 and 11. These are the starting and ending dates of the policy. Lines 12 through 15. Your address is entered on these lines. Part II. 
Covered Individuals, lines 1620. Part II reports information about each individual who is covered under your policy. This information includes the name, SSN, date of birth, and the starting and ending dates of coverage for each covered individual. For each line, a date of birth is reported in column C only if an SSN isnt entered in column B.",
"type": "NarrativeText"
},
{
"element_id": "794f7062cf3f56f2c7d70702bd3d13e1",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 3
},
"text": "Page 2",
"type": "Title"
},
{
"element_id": "3111061d9618db9e89daa1caff9edbc5",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 3
},
"text": "If advance credit payments are made, the only individuals listed on Form 1095-A will be those whom you certified to the Marketplace would be in your tax family for the year of coverage (yourself, spouse, and dependents). If you certified to the Marketplace at enrollment that one or more of the individuals who enrolled in the plan arent individuals who would be in your tax family for the year of coverage, those individuals wont be listed on your Form 1095-A. For example, if you indicated to the Marketplace at enrollment that an individual enrolling in the policy is your adult child who will not be your dependent for the year of coverage, that child will receive a separate Form 1095-A and wont be listed in Part II on your Form 1095-A.",
"type": "NarrativeText"
},
{
"element_id": "a1725ddb727fe5e14e39ac1d53a7a5ea",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 3
},
"text": "If advance credit payments are made and you certify that one or more enrolled individuals arent individuals who would be in your tax family for the year of coverage, your Form 1095-A will include coverage information in Part III that is applicable solely to the individuals listed on your Form 1095-A, and separately issued Forms 1095-A will include coverage information, including dollar amounts, applicable to those individuals not in your tax family.",
"type": "NarrativeText"
},
{
"element_id": "55320ec59d692366de355b44b22cfe70",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 3
},
"text": "If advance credit payments werent made and you didnt identify at enrollment the individuals who would be in your tax family for the year of coverage, Form 1095-A will list all enrolled individuals in Part II on your Form 1095-A.",
"type": "NarrativeText"
},
{
"element_id": "590dd32e83354b093212f7a899785c63",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 3
},
"text": "If there are more than five individuals covered by a policy, you will",
"type": "NarrativeText"
},
{
"element_id": "7f67dd09bd0e1d98bd178637f2275674",
"metadata": {
"data_source": {
"date_created": "2024-02-15 16:06:55.506805",
"date_modified": "2024-02-14 10:43:11.300753",
"permissions_data": [
{
"mode": 33188
}
],
"url": "/Users/tenzo/projects/unstructured.io/unstructured/test_unstructured_ingest/mini-holistic-all/src/IRS-2023-Form-1095-A.pdf"
},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 3
},
"text": "receive one or more additional Forms 1095-A that continue Part II. Part III. Coverage Information, lines 2133. Part III reports information about your insurance coverage that you will need to complete Form 8962 to reconcile advance credit payments or to take the premium tax credit when you file your return. Column A. This column is the monthly premiums for the plan in which you or family members were enrolled, including premiums that you paid and premiums that were paid through advance payments of the premium tax credit. If you or a family member enrolled in a separate dental plan with pediatric benefits, this column includes the portion of the dental plan premiums for the pediatric benefits. If your plan covered benefits that arent essential health benefits, such as adult dental or vision benefits, the amount in this column will be reduced by the premiums for the nonessential benefits. If the policy was terminated by your insurance company due to nonpayment of premiums for 1 or more months, then a -0- will appear in this column for these months regardless of whether advance credit payments were made for these months. See the instructions for Form 8962, Part II, on how to complete Form 8962 if -0- is reported for 1 or more months. Column B. This column is the monthly premium for the second lowest cost silver plan (SLCSP) that the Marketplace has determined applies to members of your family enrolled in the coverage. The applicable SLCSP premium is used to compute your monthly advance credit payments and the premium tax credit you take on your return. See the instructions for Form 8962, Part II, on how to use the information in this column or how to complete Form 8962 if there is no information entered, the information is incorrect, or the information is reported as -0-. 
If the policy was terminated by your insurance company due to nonpayment of premiums for 1 or more months, then a -0- will appear in this column for the months, regardless of whether advance credit payments were made for these months. Column C. This column is the monthly amount of advance credit payments that were made to your insurance company on your behalf to pay for all or part of the premiums for your coverage. If this is the only column in Part III that is filled in with an amount other than zero for a month, it means your policy was terminated by your insurance company due to nonpayment of premiums, and you arent entitled to take the premium tax credit for that month when you file your tax return. You must still reconcile the entire advance payment that was paid on your behalf for that month using Form 8962. No information will be entered in this column if no advance credit payments were made. Lines 2133. The Marketplace will report the amounts in columns A, B, and C on lines 2132 for each month and enter the totals on line 33. Use this information to complete Form 8962, line 11 or lines 1223.",
"type": "NarrativeText"
}
]

View File

@ -7,6 +7,7 @@ import pytest
from unstructured.metrics.evaluate import ( from unstructured.metrics.evaluate import (
measure_element_type_accuracy, measure_element_type_accuracy,
measure_table_structure_accuracy,
measure_text_extraction_accuracy, measure_text_extraction_accuracy,
) )
@ -20,7 +21,9 @@ TESTING_FILE_DIR = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_evaluate_files")
UNSTRUCTURED_OUTPUT_DIRNAME = "unstructured_output" UNSTRUCTURED_OUTPUT_DIRNAME = "unstructured_output"
GOLD_CCT_DIRNAME = "gold_standard_cct" GOLD_CCT_DIRNAME = "gold_standard_cct"
GOLD_ELEMENT_TYPE_DIRNAME = "gold_standard_element_type" GOLD_ELEMENT_TYPE_DIRNAME = "gold_standard_element_type"
GOLD_TABLE_STRUCTURE_DIRNAME = "gold_standard_table_structure"
UNSTRUCTURED_CCT_DIRNAME = "unstructured_output_cct" UNSTRUCTURED_CCT_DIRNAME = "unstructured_output_cct"
UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME = "unstructured_output_table_structure"
@pytest.fixture() @pytest.fixture()
@ -84,6 +87,23 @@ def test_element_type_evaluation():
assert df.iloc[0].filename == "IRS-form-1987.pdf" assert df.iloc[0].filename == "IRS-form-1987.pdf"
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_table_structure_evaluation():
    """Run the table-structure evaluation on the fixture directories and check its reports.

    Verifies that both TSV reports are written to the export directory and that
    the per-document report holds exactly one row with nine columns for the
    IRS 1095-A fixture document.
    """
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_table_structure")
    measure_table_structure_accuracy(
        output_dir=os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME),
        source_dir=os.path.join(TESTING_FILE_DIR, GOLD_TABLE_STRUCTURE_DIRNAME),
        export_dir=export_dir,
    )

    # Both reports must exist before the per-document one is parsed.
    per_doc_report = os.path.join(export_dir, "all-docs-table-structure-accuracy.tsv")
    aggregate_report = os.path.join(export_dir, "aggregate-table-structure-accuracy.tsv")
    assert os.path.isfile(per_doc_report)
    assert os.path.isfile(aggregate_report)

    report = pd.read_csv(per_doc_report, sep="\t")
    assert len(report) == 1
    assert len(report.columns) == 9
    assert report.iloc[0].filename == "IRS-2023-Form-1095-A.pdf"
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test") @pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_takes_list(): def test_text_extraction_takes_list():

View File

@ -16,6 +16,8 @@ if [ "$EVAL_NAME" == "text-extraction" ]; then
METRIC_STRATEGY="measure-text-extraction-accuracy-command" METRIC_STRATEGY="measure-text-extraction-accuracy-command"
elif [ "$EVAL_NAME" == "element-type" ]; then elif [ "$EVAL_NAME" == "element-type" ]; then
METRIC_STRATEGY="measure-element-type-accuracy-command" METRIC_STRATEGY="measure-element-type-accuracy-command"
elif [ "$EVAL_NAME" == "table-structure" ]; then
METRIC_STRATEGY="measure-table-structure-accuracy-command"
else else
echo "Wrong metric evaluation strategy given. Expected one of [ text-extraction, element-type ]. Got [ $EVAL_NAME ]." echo "Wrong metric evaluation strategy given. Expected one of [ text-extraction, element-type ]. Got [ $EVAL_NAME ]."
exit 1 exit 1

View File

@ -6,6 +6,7 @@ import click
from unstructured.metrics.evaluate import ( from unstructured.metrics.evaluate import (
measure_element_type_accuracy, measure_element_type_accuracy,
measure_table_structure_accuracy,
measure_text_extraction_accuracy, measure_text_extraction_accuracy,
) )
@ -130,5 +131,58 @@ def measure_element_type_accuracy_command(
) )
@main.command()
@click.option("--output_dir", type=str, help="Directory to structured output.")
@click.option("--source_dir", type=str, help="Directory to structured source.")
@click.option(
    "--output_list",
    type=str,
    multiple=True,
    help=(
        "Optional: list of selected structured output file names under the "
        "directory to be evaluate. If none, all files under directory will be used."
    ),
)
@click.option(
    "--source_list",
    type=str,
    multiple=True,
    help=(
        "Optional: list of selected source file names under the directory "
        "to be evaluate. If none, all files under directory will be used."
    ),
)
@click.option(
    "--export_dir",
    type=str,
    default="metrics",
    help=(
        "Directory to save the output evaluation metrics to. Default to "
        "your/working/dir/metrics/"
    ),
)
@click.option(
    "--visualize",
    is_flag=True,
    show_default=True,
    default=False,
    help="Add the flag to show progress bar.",
)
@click.option(
    "--cutoff",
    type=float,
    show_default=True,
    default=0.8,
    help=(
        "The cutoff value for the element level alignment. "
        "If not set, a default value is used"
    ),
)
def measure_table_structure_accuracy_command(
    output_dir: str,
    source_dir: str,
    export_dir: str,
    visualize: bool,
    output_list: Optional[List[str]] = None,
    source_list: Optional[List[str]] = None,
    cutoff: Optional[float] = None,
):
    # CLI entry point: forwards the parsed options unchanged to
    # measure_table_structure_accuracy and returns its result.
    #
    # Documented with comments instead of a docstring on purpose: click would
    # pick a docstring up as the command's --help text.
    #
    # NOTE(review): options declared with multiple=True arrive from click as
    # tuples (empty when the option is omitted), not None; confirm the callee
    # treats an empty tuple the same as None.
    return measure_table_structure_accuracy(
        output_dir=output_dir,
        source_dir=source_dir,
        output_list=output_list,
        source_list=source_list,
        export_dir=export_dir,
        visualize=visualize,
        cutoff=cutoff,
    )
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@ -3,6 +3,7 @@
import logging import logging
import os import os
import sys import sys
from pathlib import Path
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
import pandas as pd import pandas as pd
@ -12,6 +13,7 @@ from unstructured.metrics.element_type import (
calculate_element_type_percent_match, calculate_element_type_percent_match,
get_element_type_frequency, get_element_type_frequency,
) )
from unstructured.metrics.table.table_eval import TableEvalProcessor
from unstructured.metrics.text_extraction import calculate_accuracy, calculate_percent_missing_text from unstructured.metrics.text_extraction import calculate_accuracy, calculate_percent_missing_text
from unstructured.metrics.utils import ( from unstructured.metrics.utils import (
_display, _display,
@ -37,7 +39,6 @@ if "eval_log_handler" not in [h.name for h in logger.handlers]:
logger.setLevel(logging.DEBUG) logger.setLevel(logging.DEBUG)
agg_headers = ["metric", "average", "sample_sd", "population_sd", "count"] agg_headers = ["metric", "average", "sample_sd", "population_sd", "count"]
@ -187,3 +188,116 @@ def measure_element_type_accuracy(
_write_to_file(export_dir, "all-docs-element-type-frequency.tsv", df) _write_to_file(export_dir, "all-docs-element-type-frequency.tsv", df)
_write_to_file(export_dir, "aggregate-scores-element-type.tsv", agg_df) _write_to_file(export_dir, "aggregate-scores-element-type.tsv", agg_df)
_display(agg_df) _display(agg_df)
def measure_table_structure_accuracy(
    output_dir: str,
    source_dir: str,
    output_list: Optional[List[str]] = None,
    source_list: Optional[List[str]] = None,
    export_dir: str = "metrics",
    visualize: bool = False,
    cutoff: Optional[float] = None,
):
    """Evaluate table structure of structured outputs against ground-truth files.

    Loops through the structured outputs found under `output_dir` (or only the files named
    in `output_list`) and compares each one with the ground-truth file
    `<output file stem>.json` under `source_dir`, provided that name is listed in
    `source_list` (or found under `source_dir` when no list is given).

    Per document it records:
        - number of tables and table found accuracy
        - element in column index accuracy
        - element in row index accuracy
        - element's column content accuracy
        - element's row content accuracy

    After the whole list is processed, the per-document rows and the aggregated scores
    (computed over documents with at least one table) are written as tsv files to
    `export_dir`, and the aggregate is displayed.

    Args:
        output_dir: Directory holding the structured (predicted) outputs.
        source_dir: Directory holding the ground-truth json files.
        output_list: Optional selection of output files, relative to `output_dir`.
        source_list: Optional selection of ground-truth file names.
        export_dir: Directory the tsv reports are written to.
        visualize: Show a progress bar while processing.
        cutoff: Cutoff for the element level alignment; None keeps the processor default.
    """
    if not output_list:
        output_list = _listdir_recursive(output_dir)
    if not source_list:
        source_list = _listdir_recursive(source_dir)

    metric_names = [
        "total_tables",
        "table_level_acc",
        "element_col_level_index_acc",
        "element_row_level_index_acc",
        "element_col_level_content_acc",
        "element_row_level_content_acc",
    ]

    rows = []
    for doc in tqdm(output_list, leave=False, disable=not visualize):  # type: ignore
        doc_path = Path(doc)
        # e.g. "report.pdf.json" -> stem "report.pdf" -> doctype ".pdf"
        out_filename = doc_path.stem
        doctype = Path(out_filename).suffix
        src_gt_filename = out_filename + ".json"
        # The parent directory name of the output file is reported as the connector.
        connector = doc_path.parts[-2] if len(doc_path.parts) > 1 else None

        if src_gt_filename not in source_list:  # type: ignore
            continue

        prediction_file = Path(output_dir) / doc
        if not prediction_file.exists():
            logger.warning(f"Prediction file {prediction_file} does not exist, skipping")
            continue

        ground_truth_file = Path(source_dir) / src_gt_filename
        if not ground_truth_file.exists():
            logger.warning(f"Ground truth file {ground_truth_file} does not exist, skipping")
            continue

        processor = TableEvalProcessor.from_json_files(
            prediction_file=prediction_file,
            ground_truth_file=ground_truth_file,
            cutoff=cutoff,
        )
        report = processor.process_file()
        rows.append(
            [out_filename, doctype, connector]
            + [getattr(report, metric) for metric in metric_names]
        )

    headers = ["filename", "doctype", "connector"] + metric_names
    df = pd.DataFrame(rows, columns=headers)

    # Only documents that contain at least one table take part in the aggregation.
    having_table_df = df[df["total_tables"] > 0]
    if having_table_df.empty:
        # Nothing to aggregate: emit one placeholder row per metric with a count of 0.
        # The rows are already laid out as (metric, average, sample_sd, population_sd,
        # count), so they must NOT be transposed; a transposed 5x6 frame cannot take
        # the five `agg_headers` column names.
        agg_df = pd.DataFrame(
            [[metric, None, None, None, 0] for metric in metric_names],
            columns=agg_headers,
        )
    else:
        agg_df = (
            having_table_df.agg(
                {metric: [_mean, _stdev, _pstdev, "count"] for metric in metric_names}
            )
            .transpose()
            .reset_index()  # turns the metric names from the index into a column
        )
        agg_df.columns = agg_headers

    _write_to_file(export_dir, "all-docs-table-structure-accuracy.tsv", df)
    _write_to_file(export_dir, "aggregate-table-structure-accuracy.tsv", agg_df)
    _display(agg_df)

View File

View File

@ -0,0 +1,143 @@
import difflib
from typing import Any, Dict, List
import numpy as np
import pandas as pd
from unstructured_inference.models.eval import compare_contents_as_df
class TableAlignment:
    """Aligns predicted tables with ground-truth tables, per table and per cell.

    A table is a list of cell dictionaries with the keys "row_index", "col_index"
    and "content".
    """

    def __init__(self, cutoff: float = 0.8):
        # NOTE: every method below is static and takes its cutoff as an argument;
        # this attribute is stored but not read inside the class.
        self.cutoff = cutoff

    @staticmethod
    def get_content_in_tables(table_data: List[List[Dict[str, Any]]]) -> List[str]:
        """Extracts and concatenates the content of cells from each table in a list of tables.

        Args:
            table_data: A list of tables, each table being a list of cell data dictionaries.

        Returns:
            List of strings where each string represents the concatenated content of one table.
            Cells without a "content" key are ignored; an empty table yields "".
        """
        return [" ".join([d["content"] for d in td if "content" in d]) for td in table_data]

    @staticmethod
    def get_table_level_alignment(
        predicted_table_data: List[List[Dict[str, Any]]],
        ground_truth_table_data: List[List[Dict[str, Any]]],
    ) -> List[int]:
        """Compares predicted table data with ground truth data to find the best
        matching table index for each predicted table.

        Args:
            predicted_table_data: A list of predicted tables.
            ground_truth_table_data: A list of ground truth tables.

        Returns:
            A list of indices indicating the best match in the ground truth for
            each predicted table, or -1 where no ground truth table is close enough.
        """
        ground_truth_texts = TableAlignment.get_content_in_tables(ground_truth_table_data)
        predicted_texts = TableAlignment.get_content_in_tables(predicted_table_data)

        matched_indices = []
        for reference in predicted_texts:
            # Deliberately permissive cutoff: table matching only needs a rough
            # similarity between the concatenated texts.
            matches = difflib.get_close_matches(reference, ground_truth_texts, cutoff=0.1, n=1)
            # `index` returns the first ground truth table carrying the matched text.
            matched_indices.append(ground_truth_texts.index(matches[0]) if matches else -1)
        return matched_indices

    @staticmethod
    def _zip_to_dataframe(table_data: List[Dict[str, Any]]) -> pd.DataFrame:
        """Pivots a list of cells into a row_index x col_index frame of cell contents."""
        df = pd.DataFrame(table_data).pivot(
            index="row_index",
            columns="col_index",
            values="content",
        )
        return df

    @staticmethod
    def get_element_level_alignment(
        predicted_table_data: List[List[Dict[str, Any]]],
        ground_truth_table_data: List[List[Dict[str, Any]]],
        matched_indices: List[int],
        cutoff: float = 0.8,
    ) -> Dict[str, float]:
        """Aligns elements of the predicted tables with the ground truth tables at the cell level.

        Args:
            predicted_table_data: A list of predicted tables.
            ground_truth_table_data: A list of ground truth tables.
            matched_indices: Indices of the best matching ground truth table for each
                predicted table (-1 for unmatched tables, which are skipped).
            cutoff: The cutoff value for the close matches.

        Returns:
            A dictionary with the keys "col_index_acc", "row_index_acc",
            "col_content_acc" and "row_content_acc", or an empty dictionary when no
            predicted cell could be matched to a ground truth cell.
        """
        aligned_element_col_count = 0
        aligned_element_row_count = 0
        total_element_count = 0
        content_diff_cols = []
        content_diff_rows = []

        for idx, td in zip(matched_indices, predicted_table_data):
            if idx == -1:
                continue
            ground_truth_td = ground_truth_table_data[idx]

            # An empty table (e.g. the placeholder left behind by a failed conversion)
            # has no cells to align, and pivoting it would raise KeyError on the
            # missing "row_index" column, so skip the pair.
            if not td or not ground_truth_td:
                continue

            # Get row and col content accuracy
            predict_table_df = TableAlignment._zip_to_dataframe(td)
            ground_truth_table_df = TableAlignment._zip_to_dataframe(ground_truth_td)
            table_content_diff = compare_contents_as_df(
                ground_truth_table_df.fillna(""),
                predict_table_df.fillna(""),
            )
            content_diff_cols.append(table_content_diff["by_col_token_ratio"])
            content_diff_rows.append(table_content_diff["by_row_token_ratio"])

            # Get row and col index accuracy
            ground_truth_td_contents_list = [gtd["content"].lower() for gtd in ground_truth_td]

            for td_ele in td:
                content = td_ele["content"].lower()
                row_index = td_ele["row_index"]
                col_idx = td_ele["col_index"]

                matches = difflib.get_close_matches(
                    content,
                    ground_truth_td_contents_list,
                    cutoff=cutoff,
                    n=1,
                )
                if not matches:
                    # Cells without a ground truth counterpart are left out of the totals.
                    continue

                gt_cell = ground_truth_td[ground_truth_td_contents_list.index(matches[0])]
                if row_index == gt_cell["row_index"]:
                    aligned_element_row_count += 1
                if col_idx == gt_cell["col_index"]:
                    aligned_element_col_count += 1
                total_element_count += 1

        if total_element_count > 0:
            col_index_acc = round(aligned_element_col_count / total_element_count, 2)
            row_index_acc = round(aligned_element_row_count / total_element_count, 2)
            # The content ratios are divided by 100 to land on the same 0-1 scale
            # as the index accuracies.
            col_content_acc = round(np.mean(content_diff_cols) / 100.0, 2)
            row_content_acc = round(np.mean(content_diff_rows) / 100.0, 2)

            return {
                "col_index_acc": col_index_acc,
                "row_index_acc": row_index_acc,
                "col_content_acc": col_content_acc,
                "row_content_acc": row_content_acc,
            }

        return {}

View File

@ -0,0 +1,224 @@
"""
The purpose of this script is to create a comprehensive metric for table evaluation
1. Verify table identification.
a. Concatenate all text in the table and ground truth.
b. Calculate the difference to find the closest matches.
c. If contents are too different, mark as a failure.
2. For each identified table:
a. Align elements at the level of individual elements.
b. Match elements by text.
c. Determine indexes for both predicted and actual data.
d. Compare index tuples at column and row levels to assess content shifts.
e. Compare the token order of the contents after flattening them along columns and along rows.
f. Note: Imperfect HTML is acceptable unless it impedes parsing,
in which case the table is considered failed.
Example
python table_eval.py \
--prediction_file "model_output.pdf.json" \
--ground_truth_file "ground_truth.pdf.json"
"""
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional
import click
import numpy as np
from unstructured.metrics.table.table_alignment import TableAlignment
from unstructured.metrics.table.table_extraction import (
extract_and_convert_tables_from_ground_truth,
extract_and_convert_tables_from_prediction,
)
@dataclass
class TableEvaluation:
    """Table metrics gathered for one evaluated document.

    As populated by `TableEvalProcessor.process_file`, an accuracy of -1.0 means the
    metric could not be computed for the document.
    """

    # Number of tables found in the ground truth.
    total_tables: int
    # Matched predicted tables divided by `total_tables`.
    table_level_acc: float
    # Share of matched cells whose column index equals the ground truth column index.
    element_col_level_index_acc: float
    # Share of matched cells whose row index equals the ground truth row index.
    element_row_level_index_acc: float
    # Content similarity of the tables compared column by column.
    element_col_level_content_acc: float
    # Content similarity of the tables compared row by row.
    element_row_level_content_acc: float
def _count_predicted_tables(matched_indices: List[int]) -> int:
"""Counts the number of predicted tables that have a corresponding match in the ground truth.
Args:
matched_indices: List of indices indicating matches between predicted
and ground truth tables.
Returns:
The count of matched predicted tables.
"""
return sum(1 for idx in matched_indices if idx >= 0)
class TableEvalProcessor:
    """Computes table-level and element-level metrics for one prediction/ground truth pair."""

    def __init__(
        self,
        prediction: List[Dict[str, Any]],
        ground_truth: List[Dict[str, Any]],
        cutoff: float = 0.8,
    ):
        """
        Initializes the TableEvalProcessor prediction and ground truth.

        Args:
            prediction: Predicted table data.
            ground_truth: Ground truth table data. The tables text should be in the deckerd format.
            cutoff: The cutoff value for the element level alignment. Default is 0.8.

        Examples:
            ground_truth: [
                {
                    "type": "Table",
                    "text": [
                        {
                            "id": "f4c35dae-105b-46f5-a77a-7fbc199d6aca",
                            "x": 0,
                            "y": 0,
                            "w": 1,
                            "h": 1,
                            "content": "Cell text"
                        },
                        ...
                    ]
                }
            ]

            prediction: [
                {
                    "element_id": <id_string>,
                    ...
                    "metadata": {
                        ...
                        "text_as_html": "<table><thead><th rowspan='2'>June....</tr></td></table>"
                    },
                }
            ]
        """
        self.prediction = prediction
        self.ground_truth = ground_truth
        self.cutoff = cutoff

    @classmethod
    def from_json_files(
        cls,
        prediction_file: Path,
        ground_truth_file: Path,
        cutoff: Optional[float] = None,
    ) -> "TableEvalProcessor":
        """Factory classmethod to initialize the object with path to json files instead of dicts

        Args:
            prediction_file: Path to the json file containing the predicted table data.
            ground_truth_file: Path to the json file containing the ground truth table data.
            cutoff: The cutoff value for the element level alignment.
                If not set, class default value is used (=0.8).

        Returns:
            TableEvalProcessor: An instance of the class initialized with the provided data.
        """
        # Read explicitly as UTF-8: without it `open` falls back to the locale encoding,
        # which can mis-decode non-ASCII cell text on some platforms.
        with open(prediction_file, encoding="utf-8") as f:
            prediction = json.load(f)
        with open(ground_truth_file, encoding="utf-8") as f:
            ground_truth = json.load(f)

        if cutoff is None:
            # Leave the argument out so the constructor default applies.
            return cls(prediction=prediction, ground_truth=ground_truth)
        return cls(prediction=prediction, ground_truth=ground_truth, cutoff=cutoff)

    def process_file(self) -> "TableEvaluation":
        """Processes the files and computes table-level and element-level accuracy.

        Returns:
            TableEvaluation: A dataclass object containing the computed metrics. A metric
            that cannot be computed (no ground truth tables, or no aligned cells) is
            reported as -1.0.
        """
        predicted_table_data = extract_and_convert_tables_from_prediction(
            self.prediction,
        )
        ground_truth_table_data = extract_and_convert_tables_from_ground_truth(
            self.ground_truth,
        )

        matched_indices = TableAlignment.get_table_level_alignment(
            predicted_table_data,
            ground_truth_table_data,
        )
        total_predicted_tables = _count_predicted_tables(matched_indices)
        total_tables = len(ground_truth_table_data)

        # Empty dictionary when no predicted cell could be aligned.
        metrics = TableAlignment.get_element_level_alignment(
            predicted_table_data,
            ground_truth_table_data,
            matched_indices,
            cutoff=self.cutoff,
        )

        def _score(key: str) -> float:
            # One document contributes a single value per metric; -1.0 marks "not computed".
            return round(np.mean([metrics[key]]), 2) if metrics else -1.0

        return TableEvaluation(
            total_tables=total_tables,
            # Matched predicted tables relative to the number of ground truth tables.
            table_level_acc=(
                round(total_predicted_tables / total_tables, 2) if total_tables else -1.0
            ),
            element_col_level_index_acc=_score("col_index_acc"),
            element_row_level_index_acc=_score("row_index_acc"),
            element_col_level_content_acc=_score("col_content_acc"),
            element_row_level_content_acc=_score("row_content_acc"),
        )
@click.command()
@click.option(
    "--prediction_file", help="Path to the model prediction JSON file", type=click.Path(exists=True)
)
@click.option(
    "--ground_truth_file", help="Path to the ground truth JSON file", type=click.Path(exists=True)
)
@click.option(
    "--cutoff",
    type=float,
    show_default=True,
    default=0.8,
    # Implicit string concatenation keeps source indentation out of the help text
    # (a backslash continuation inside the literal would embed it).
    help=(
        "The cutoff value for the element level alignment. "
        "If not set, a default value is used"
    ),
)
def run(prediction_file: str, ground_truth_file: str, cutoff: Optional[float]):
    """Runs the table evaluation process and prints the computed metrics."""
    processor = TableEvalProcessor.from_json_files(
        Path(prediction_file),
        Path(ground_truth_file),
        cutoff=cutoff,
    )
    report = processor.process_file()
    # The report is a dataclass, so printing it shows every metric by name.
    print(report)
# Script entry point: invoke the click command when the module is executed directly.
if __name__ == "__main__":
    run()

View File

@ -0,0 +1,129 @@
from typing import Any, Dict, List
from bs4 import BeautifulSoup
# Placeholder cell used by `_convert_table_from_deckerd` for a ground truth entry that
# is missing one of the expected keys or cannot be indexed by key.
EMPTY_CELL = {
    "row_index": "",
    "col_index": "",
    "content": "",
}
def _convert_table_from_html(content: str) -> List[Dict[str, Any]]:
    """Convert html format to table structure.

    Args:
        content: The html content with a table to extract.

    Returns:
        A list of dictionaries where each dictionary represents a cell in the table,
        with the keys "row_index", "col_index" and "content".

    Raises:
        ValueError: If `content` holds no <table> element.
    """
    soup = BeautifulSoup(content, "html.parser")
    table = soup.find("table")
    if table is None:
        raise ValueError("No <table> element found in the html content")

    # A <thead> that holds its cells directly (no <tr>) acts as the header row itself.
    # A <thead> that wraps <tr> elements is skipped: its rows are picked up on their own,
    # and keeping it would emit every header cell twice and shift all row indices.
    rows = [
        row
        for row in table.find_all(["tr", "thead"])
        if not (row.name == "thead" and row.find("tr") is not None)
    ]

    table_data = []
    for row_index, row in enumerate(rows):
        # Header and data cells share one column counter, in document order, so a row
        # such as <tr><th>..</th><td>..</td></tr> does not yield two cells at column 0.
        for col_index, cell in enumerate(row.find_all(["th", "td"])):
            table_data.append(
                {
                    "row_index": row_index,
                    "col_index": col_index,
                    "content": cell.text,
                }
            )
    return table_data
def _convert_table_from_deckerd(content: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Convert deckerd format to table structure.
Args:
content: The deckerd formatted content with a table to extract.
Returns:
A list of dictionaries where each dictionary represents a cell in the table.
"""
table_data = []
for table in content:
try:
cell_data = {
"row_index": table["y"],
"col_index": table["x"],
"content": table["content"],
}
except KeyError:
cell_data = EMPTY_CELL
except TypeError:
cell_data = EMPTY_CELL
table_data.append(cell_data)
return table_data
def extract_and_convert_tables_from_ground_truth(
    file_elements: List[Dict[str, Any]],
) -> List[List[Dict[str, Any]]]:
    """Extracts the tables of a ground truth file and converts them to lists of cells.

    Args:
        file_elements: List of elements from the ground truth file. Only elements with
            "type" equal to "Table" that carry a "text" entry are considered.

    Returns:
        A list of tables with each table represented as a list of cell data dictionaries.
        A table that fails to convert is kept as an empty list so it is still counted.
    """
    ground_truth_table_data = []
    for element in file_elements:
        if element.get("type") != "Table" or "text" not in element:
            continue
        try:
            converted_data = _convert_table_from_deckerd(
                element["text"],
            )
            ground_truth_table_data.append(converted_data)
        except Exception as e:
            # Best effort: report the problem and keep an empty table (a list, matching
            # the declared return type) instead of aborting the whole evaluation.
            print(f"Error converting ground truth data: {e}")
            ground_truth_table_data.append([])
    return ground_truth_table_data
def extract_and_convert_tables_from_prediction(
    file_elements: List[Dict[str, Any]],
) -> List[List[Dict[str, Any]]]:
    """Extracts the predicted tables of a file and converts them to lists of cells.

    Args:
        file_elements: List of elements from the file. Only elements with "type" equal
            to "Table" whose metadata carries html in "text_as_html" are considered.

    Returns:
        A list of tables with each table represented as a list of cell data dictionaries.
        A table that fails to convert is kept as an empty list so it is still counted.
    """
    predicted_table_data = []
    for element in file_elements:
        if element.get("type") != "Table":
            continue
        # Tolerate elements that carry no metadata at all.
        val = (element.get("metadata") or {}).get("text_as_html")
        # Match on "<table" rather than "<table>" so that tables whose opening tag
        # carries attributes (e.g. <table border="1">) are not silently dropped.
        if not val or "<table" not in val:
            continue
        try:
            converted_data = _convert_table_from_html(val)
            predicted_table_data.append(converted_data)
        except Exception as e:
            # Best effort: report the problem and keep an empty table (a list, matching
            # the declared return type) instead of aborting the whole evaluation.
            print(f"Error converting Unstructured table data: {e}")
            predicted_table_data.append([])
    return predicted_table_data