diff --git a/CHANGELOG.md b/CHANGELOG.md index bd54ff4a4..c0ac7f377 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,13 +1,14 @@ -## 0.16.19-dev2 +## 0.16.19-dev3 ### Enhancements ### Features ### Fixes -- **fix a bug where table extraction is skipped when it shouldn't**. Pages with just one table as its content or starts with a table misses table extraction. The routing logic is now fixed. +- **Fix a bug where table extraction is skipped when it shouldn't be**. Pages whose only content is a table, or that start with a table, previously missed table extraction. The routing logic is now fixed. - **Correct deprecated `ruff` invocation in `make tidy`**. This will future-proof it or avoid surprises if someone happens to upgrade Ruff. - **Remove upper bound constraint on python version** in setup.py. Python3.13 is not yet officially supported, but allow users to try. +- **Fix HTML elements being removed from inside table cells** in html partition v=2.0. The HTML partitioner now correctly preserves HTML elements inside table cells. ## 0.16.17 diff --git a/test_unstructured/documents/unstructured_json_output/example.json b/test_unstructured/documents/unstructured_json_output/example.json index f8997df5e..eb1c71f24 100644 --- a/test_unstructured/documents/unstructured_json_output/example.json +++ b/test_unstructured/documents/unstructured_json_output/example.json @@ -4,6 +4,10 @@ "metadata": { "category_depth": 0, "filename": "example.pdf", + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "897a8a47377c4ad6aab839a929879537", "text_as_html": "
" @@ -16,6 +20,10 @@ "metadata": { "category_depth": 1, "filename": "example.pdf", + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "3a6b156a81764e17be128264241f8136", "text_as_html": "
" @@ -28,9 +36,13 @@ "metadata": { "category_depth": 2, "filename": "example.pdf", + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "45b3d0053468484ba1c7b53998115412", - "text_as_html": "

Header

" + "text_as_html": "

Header

" }, "text": "Header", "type": "Title" @@ -40,9 +52,13 @@ "metadata": { "category_depth": 2, "filename": "example.pdf", + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "45b3d0053468484ba1c7b53998115412", - "text_as_html": "" + "text_as_html": "" }, "text": "Date: October 30, 2023", "type": "UncategorizedText" @@ -52,9 +68,13 @@ "metadata": { "category_depth": 1, "filename": "example.pdf", + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "3a6b156a81764e17be128264241f8136", - "text_as_html": "
" + "text_as_html": "
" }, "text": "From field name Example value", "type": "UncategorizedText" @@ -64,6 +84,10 @@ "metadata": { "category_depth": 1, "filename": "example.pdf", + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "3a6b156a81764e17be128264241f8136", "text_as_html": "
" @@ -76,9 +100,13 @@ "metadata": { "category_depth": 2, "filename": "example.pdf", + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "592422373ed741b68a077e2003f8ed81", - "text_as_html": "
DescriptionRow header
Value description50 $ (1.32 %)
" + "text_as_html": "
DescriptionRow header
Value description50 $(1.32 %)
" }, "text": "Description Row header Value description 50 $ (1.32 %)", "type": "Table" @@ -88,6 +116,10 @@ "metadata": { "category_depth": 1, "filename": "example.pdf", + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "3a6b156a81764e17be128264241f8136", "text_as_html": "
" @@ -100,9 +132,13 @@ "metadata": { "category_depth": 2, "filename": "example.pdf", + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "1032242af75c4b37984ea7fea9aac74c", - "text_as_html": "

2. Subtitle

" + "text_as_html": "

2. Subtitle

" }, "text": "2. Subtitle", "type": "Title" @@ -112,9 +148,13 @@ "metadata": { "category_depth": 2, "filename": "example.pdf", + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "1032242af75c4b37984ea7fea9aac74c", - "text_as_html": "

Paragraph text

" + "text_as_html": "

Paragraph text

" }, "text": "Paragraph text", "type": "NarrativeText" diff --git a/test_unstructured/documents/unstructured_json_output/example_full_doc.json b/test_unstructured/documents/unstructured_json_output/example_full_doc.json index 78049dba3..42e2ee9c6 100644 --- a/test_unstructured/documents/unstructured_json_output/example_full_doc.json +++ b/test_unstructured/documents/unstructured_json_output/example_full_doc.json @@ -3,6 +3,10 @@ "element_id": "630907012e0442ab8f9bf97a8d1fa8a0", "metadata": { "category_depth": 0, + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "a77ae2bba17845d6bcce44f6aebadfb5", "text_as_html": "
" @@ -14,6 +18,10 @@ "element_id": "d0a9edd181f542f0ba695489f14c4b75", "metadata": { "category_depth": 1, + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", "text_as_html": "
" @@ -25,9 +33,13 @@ "element_id": "2a8866a868414163afee2ef24574fc9b", "metadata": { "category_depth": 2, + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "d0a9edd181f542f0ba695489f14c4b75", - "text_as_html": "Table of Contents https://www.sec.gov/Archives/edgar/data/1061219/000106121923000017/form10q.htm " + "text_as_html": "Table of Contentshttps://www.sec.gov/Archives/edgar/data/1061219/000106121923000017/form10q.htm" }, "text": "Table of Contents 11/7/23, 2:38 PM https://www.sec.gov/Archives/edgar/data/1061219/000106121923000017/form10q.htm", "type": "UncategorizedText" @@ -36,9 +48,13 @@ "element_id": "2bca4006451a405c87ebaf6eb9ff7bd9", "metadata": { "category_depth": 1, + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", - "text_as_html": "

ENTERPRISE PRODUCTS PARTNERS L.P.

" + "text_as_html": "

ENTERPRISE PRODUCTS PARTNERS L.P.

" }, "text": "ENTERPRISE PRODUCTS PARTNERS L.P.", "type": "Title" @@ -47,9 +63,13 @@ "element_id": "8da7d91b8f094acfb4caef69d96d17b9", "metadata": { "category_depth": 1, + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", - "text_as_html": "

NOTES TO UNAUDITED CONDENSED CONSOLIDATED FINANCIAL STATEMENTS

" + "text_as_html": "

NOTES TO UNAUDITED CONDENSED CONSOLIDATED FINANCIAL STATEMENTS

" }, "text": "NOTES TO UNAUDITED CONDENSED CONSOLIDATED FINANCIAL STATEMENTS", "type": "Title" @@ -58,9 +78,13 @@ "element_id": "1a8af2164abc4fed820445c7d7a1652e", "metadata": { "category_depth": 1, + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", - "text_as_html": "

Note 6. Intangible Assets and Goodwill

" + "text_as_html": "

Note 6. Intangible Assets and Goodwill

" }, "text": "Note 6. Intangible Assets and Goodwill", "type": "Title" @@ -69,9 +93,13 @@ "element_id": "c9dcb08578704efeb997f7d3dd659a61", "metadata": { "category_depth": 1, + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", - "text_as_html": "

Identifiable Intangible Assets

" + "text_as_html": "

Identifiable Intangible Assets

" }, "text": "Identifiable Intangible Assets", "type": "Title" @@ -80,9 +108,13 @@ "element_id": "2189d06e7f1f4d73b93c3f1845486b52", "metadata": { "category_depth": 1, + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", - "text_as_html": "

The following table summarizes our intangible assets by business segment at the dates indicated:

" + "text_as_html": "

The following table summarizes our intangible assets by business segment at the dates indicated:

" }, "text": "The following table summarizes our intangible assets by business segment at the dates indicated:", "type": "NarrativeText" @@ -91,9 +123,13 @@ "element_id": "d0f9bd2adefa42e18357960d582588bd", "metadata": { "category_depth": 1, + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", - "text_as_html": "
June 30, 2023December 31, 2022
Gross ValueAccumulated AmortizationCarrying ValueGross ValueAccumulated AmortizationCarrying Value
NGL Pipelines & Services:
Customer relationship intangibles$ 449$ (257)$ 192$ 449$ (249)$ 200
Contract-based intangibles751(95)656749(84)665
Segment total1,200(352)8481,198(333)865
Crude Oil Pipelines & Services:
Customer relationship intangibles2,195(477)1,7182,195(431)1,764
Contract-based intangibles283(273)10283(271)12
Segment total2,478(750)1,7282,478(702)1,776
Natural Gas Pipelines & Services:
Customer relationship intangibles1,350(607)7431,350(588)762
Contract-based intangibles639(201)438639(195)444
Segment total1,989(808)1,1811,989(783)1,206
Petrochemical & Refined Products Services:
Customer relationship intangibles181(83)98181(80)101
Contract-based intangibles45(29)1645(28)17
Segment total226(112)114226(108)118
Total intangible assets$ 5,893$ (2,022)$ 3,871$ 5,891$ (1,926)$ 3,965
" + "text_as_html": "
June 30, 2023December 31, 2022
Gross ValueAccumulated AmortizationCarrying ValueGross ValueAccumulated AmortizationCarrying Value
NGL Pipelines & Services:
Customer relationship intangibles$

449

$

(257)

$

192

$

449

$

(249)

$

200

Contract-based intangibles751(95)656749(84)665
Segment total1,200(352)8481,198(333)865
Crude Oil Pipelines & Services:
Customer relationship intangibles2,195(477)1,7182,195(431)1,764
Contract-based intangibles283(273)10283(271)12
Segment total2,478(750)1,7282,478(702)1,776
Natural Gas Pipelines & Services:
Customer relationship intangibles1,350(607)7431,350(588)762
Contract-based intangibles639(201)438639(195)444
Segment total1,989(808)1,1811,989(783)1,206
Petrochemical & Refined Products Services:
Customer relationship intangibles181(83)98181(80)101
Contract-based intangibles45(29)1645(28)17
Segment total226(112)114226(108)118
Total intangible assets$

5,893

$

(2,022)

$

3,871

$

5,891

$

(1,926)

$

3,965

" }, "text": "June 30, 2023 December 31, 2022 Gross Value Accumulated Amortization Carrying Value Gross Value Accumulated Amortization Carrying Value NGL Pipelines & Services: Customer relationship intangibles $ 449 $ (257) $ 192 $ 449 $ (249) $ 200 Contract-based intangibles 751 (95) 656 749 (84) 665 Segment total 1,200 (352) 848 1,198 (333) 865 Crude Oil Pipelines & Services: Customer relationship intangibles 2,195 (477) 1,718 2,195 (431) 1,764 Contract-based intangibles 283 (273) 10 283 (271) 12 Segment total 2,478 (750) 1,728 2,478 (702) 1,776 Natural Gas Pipelines & Services: Customer relationship intangibles 1,350 (607) 743 1,350 (588) 762 Contract-based intangibles 639 (201) 438 639 (195) 444 Segment total 1,989 (808) 1,181 1,989 (783) 1,206 Petrochemical & Refined Products Services: Customer relationship intangibles 181 (83) 98 181 (80) 101 Contract-based intangibles 45 (29) 16 45 (28) 17 Segment total 226 (112) 114 226 (108) 118 Total intangible assets $ 5,893 $ (2,022) $ 3,871 $ 5,891 $ (1,926) $ 3,965", "type": "Table" @@ -102,9 +138,13 @@ "element_id": "48d7d6313bc141c6945f7f5eee588db8", "metadata": { "category_depth": 1, + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", - "text_as_html": "

The following table presents the amortization expense of our intangible assets by business segment for the periods indicated:

" + "text_as_html": "

The following table presents the amortization expense of our intangible assets by business segment for the periods indicated:

" }, "text": "The following table presents the amortization expense of our intangible assets by business segment for the periods indicated:", "type": "NarrativeText" @@ -113,9 +153,13 @@ "element_id": "d3dbbac8b8834b109421da2cbeda1399", "metadata": { "category_depth": 1, + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", - "text_as_html": "
For the Three Months Ended June 30,For the Six Months Ended June 30,
2023202220232022
NGL Pipelines & Services$ 10$ 9$ 19$ 17
Crude Oil Pipelines & Services25214841
Natural Gas Pipelines & Services13142525
Petrochemical & Refined Products Services2143
Total$ 50$ 45$ 96$ 86
" + "text_as_html": "
For the Three Months Ended June 30,For the Six Months Ended June 30,
2023202220232022
NGL Pipelines & Services$

10

$

9

$

19

$

17

Crude Oil Pipelines & Services25214841
Natural Gas Pipelines & Services13142525
Petrochemical & Refined Products Services2143
Total$

50

$

45

$

96

$

86

" }, "text": "For the Three Months Ended June 30, For the Six Months Ended June 30, 2023 2022 2023 2022 NGL Pipelines & Services $ 10 $ 9 $ 19 $ 17 Crude Oil Pipelines & Services 25 21 48 41 Natural Gas Pipelines & Services 13 14 25 25 Petrochemical & Refined Products Services 2 1 4 3 Total $ 50 $ 45 $ 96 $ 86", "type": "Table" @@ -124,9 +168,13 @@ "element_id": "42c2678b8b5d4977a27e8ba6a8b2224f", "metadata": { "category_depth": 1, + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", - "text_as_html": "

The following table presents our forecast of amortization expense associated with existing intangible assets for the periods indicated:

" + "text_as_html": "

The following table presents our forecast of amortization expense associated with existing intangible assets for the periods indicated:

" }, "text": "The following table presents our forecast of amortization expense associated with existing intangible assets for the periods indicated:", "type": "NarrativeText" @@ -135,9 +183,13 @@ "element_id": "fb8c1e14c5ca44caa7359a4ace457701", "metadata": { "category_depth": 1, + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", - "text_as_html": "
Remainder of 20232024202520262027
$107222230237235
" + "text_as_html": "
Remainder of 20232024202520262027
$107222230237235
" }, "text": "Remainder of 2023 2024 2025 2026 2027 $ 107 222 230 237 235", "type": "Table" @@ -146,9 +198,13 @@ "element_id": "a5c180f735a846929e7d496ac5d49603", "metadata": { "category_depth": 1, + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", - "text_as_html": "

Goodwill

" + "text_as_html": "

Goodwill

" }, "text": "Goodwill", "type": "Title" @@ -157,9 +213,13 @@ "element_id": "f1d9b3bb0fc04cae8e419a394f8c0e45", "metadata": { "category_depth": 1, + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", - "text_as_html": "

Goodwill represents the excess of the purchase price of an acquired business over the amounts assigned to assets acquired and liabilities assumed in the transaction. There has been no change in our goodwill amounts since those reported in our 2022 Form 10-K.

" + "text_as_html": "

Goodwill represents the excess of the purchase price of an acquired business over the amounts assigned to assets acquired and liabilities assumed in the transaction. There has been no change in our goodwill amounts since those reported in our 2022 Form 10-K.

" }, "text": "Goodwill represents the excess of the purchase price of an acquired business over the amounts assigned to assets acquired and liabilities assumed in the transaction. There has been no change in our goodwill amounts since those reported in our 2022 Form 10-K.", "type": "NarrativeText" @@ -168,9 +228,13 @@ "element_id": "9cc42bfd543943fe97a179576d1f0f09", "metadata": { "category_depth": 1, + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", - "text_as_html": "13 " + "text_as_html": "13" }, "text": "13", "type": "PageNumber" @@ -179,6 +243,10 @@ "element_id": "e29d700b4fd046dd82dcd5e9a2e1f5ab", "metadata": { "category_depth": 1, + "filetype": "text/html", + "languages": [ + "eng" + ], "page_number": 1, "parent_id": "630907012e0442ab8f9bf97a8d1fa8a0", "text_as_html": "