452 lines
17 KiB
Python
Raw Normal View History

rfctr(xlsx): cleaning in prep for XLSX algorithm replacement (#2524) **Reviewers:** It may be faster to review each of the three commits separately since they are groomed to only make one type of change each (typing, docstrings, test-cleanup). **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). These commits clean up typing, lint, and other non-behavior-changing aspects of the code in preparation for installing a new algorithm that correctly identifies and partitions contiguous sub-regions of an Excel worksheet into distinct elements. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-08 15:33:41 -08:00
"""Partitioner for Excel 2007+ (XLSX) spreadsheets."""
from __future__ import annotations
import io
from tempfile import SpooledTemporaryFile
rfctr(xlsx): cleaning in prep for XLSX algorithm replacement (#2524) **Reviewers:** It may be faster to review each of the three commits separately since they are groomed to only make one type of change each (typing, docstrings, test-cleanup). **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). These commits clean up typing, lint, and other non-behavior-changing aspects of the code in preparation for installing a new algorithm that correctly identifies and partitions contiguous sub-regions of an Excel worksheet into distinct elements. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-08 15:33:41 -08:00
from typing import IO, Any, Iterator, Optional, cast
import networkx as nx
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
import numpy as np
import pandas as pd
rfctr(xlsx): cleaning in prep for XLSX algorithm replacement (#2524) **Reviewers:** It may be faster to review each of the three commits separately since they are groomed to only make one type of change each (typing, docstrings, test-cleanup). **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). These commits clean up typing, lint, and other non-behavior-changing aspects of the code in preparation for installing a new algorithm that correctly identifies and partitions contiguous sub-regions of an Excel worksheet into distinct elements. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-08 15:33:41 -08:00
from lxml.html.soupparser import fromstring as soupparser_fromstring # pyright: ignore
from typing_extensions import Self, TypeAlias
from unstructured.chunking import add_chunking_strategy
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
from unstructured.cleaners.core import clean_bullets
from unstructured.documents.elements import (
Element,
ElementMetadata,
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
ListItem,
NarrativeText,
Table,
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
Text,
Title,
process_metadata,
)
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
fix(xlsx): xlsx subtable algorithm (#2534) **Reviewers:** It may be easier to review each of the two commits separately. The first adds the new `_SubtableParser` object with its unit-tests and the second one uses that object to replace the flawed existing subtable-parsing algorithm. **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). This PR replaces the flawed existing algorithm with a `_SubtableParser` object that encapsulates all that logic and has thorough unit-tests. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes. This PR fixes all these bugs: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-13 20:29:17 -08:00
from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
from unstructured.partition.lang import apply_lang_metadata
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
from unstructured.partition.text_type import (
is_bulleted_text,
is_possible_narrative_text,
is_possible_numbered_list,
is_possible_title,
)
fix(xlsx): xlsx subtable algorithm (#2534) **Reviewers:** It may be easier to review each of the two commits separately. The first adds the new `_SubtableParser` object with its unit-tests and the second one uses that object to replace the flawed existing subtable-parsing algorithm. **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). This PR replaces the flawed existing algorithm with a `_SubtableParser` object that encapsulates all that logic and has thorough unit-tests. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes. This PR fixes all these bugs: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-13 20:29:17 -08:00
from unstructured.utils import lazyproperty
rfctr(xlsx): cleaning in prep for XLSX algorithm replacement (#2524) **Reviewers:** It may be faster to review each of the three commits separately since they are groomed to only make one type of change each (typing, docstrings, test-cleanup). **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). These commits clean up typing, lint, and other non-behavior-changing aspects of the code in preparation for installing a new algorithm that correctly identifies and partitions contiguous sub-regions of an Excel worksheet into distinct elements. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-08 15:33:41 -08:00
_CellCoordinate: TypeAlias = "tuple[int, int]"
DETECTION_ORIGIN: str = "xlsx"
@process_metadata()
@add_metadata_with_filetype(FileType.XLSX)
chore: Table chunking (#1540) This change is adding to our `add_chunking_strategy` logic so that we are able to chunk Table elements' `text` and `text_as_html` params. In order to keep the functionality under the same `by_title` chunking strategy we have renamed the `combine_under_n_chars` to `max_characters`. It functions the same way for the combining elements under Title's, as well as specifying a chunk size (in chars) for TableChunk elements. *renaming the variable to `max_characters` will also reflect the 'hard max' we will implement for large elements in followup PRs Additionally -> some lint changes snuck in when I ran `make tidy` hence the minor changes in unrelated files :) TODO: ✅ add unit tests --> note: added where I could to unit tests! Some unit tests I just clarified that the chunking strategy was now 'by_title' because we don't have a file example that has Table elements to test the 'by_num_characters' chunking strategy ✅ update changelog To manually test: ``` In [1]: filename="example-docs/example-10k.html" In [2]: from unstructured.chunking.title import chunk_table_element In [3]: from unstructured.partition.auto import partition In [4]: elements = partition(filename) # element at -2 happens to be a Table, and we'll get chunks of char size 4 here In [5]: chunks = chunk_table_element(elements[-2], 4) # examine text and text_as_html params ln [6]: for c in chunks: print(c.text) print(c.metadata.text_as_html) ``` --------- Co-authored-by: Yao You <theyaoyou@gmail.com>
2023-10-03 09:40:34 -07:00
@add_chunking_strategy()
def partition_xlsx(
filename: Optional[str] = None,
rfctr(xlsx): cleaning in prep for XLSX algorithm replacement (#2524) **Reviewers:** It may be faster to review each of the three commits separately since they are groomed to only make one type of change each (typing, docstrings, test-cleanup). **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). These commits clean up typing, lint, and other non-behavior-changing aspects of the code in preparation for installing a new algorithm that correctly identifies and partitions contiguous sub-regions of an Excel worksheet into distinct elements. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-08 15:33:41 -08:00
file: Optional[IO[bytes]] = None,
metadata_filename: Optional[str] = None,
include_metadata: bool = True,
infer_table_structure: bool = True,
rfctr(xlsx): cleaning in prep for XLSX algorithm replacement (#2524) **Reviewers:** It may be faster to review each of the three commits separately since they are groomed to only make one type of change each (typing, docstrings, test-cleanup). **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). These commits clean up typing, lint, and other non-behavior-changing aspects of the code in preparation for installing a new algorithm that correctly identifies and partitions contiguous sub-regions of an Excel worksheet into distinct elements. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-08 15:33:41 -08:00
languages: Optional[list[str]] = ["auto"],
detect_language_per_element: bool = False,
metadata_last_modified: Optional[str] = None,
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
include_header: bool = False,
find_subtable: bool = True,
rfctr(xlsx): cleaning in prep for XLSX algorithm replacement (#2524) **Reviewers:** It may be faster to review each of the three commits separately since they are groomed to only make one type of change each (typing, docstrings, test-cleanup). **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). These commits clean up typing, lint, and other non-behavior-changing aspects of the code in preparation for installing a new algorithm that correctly identifies and partitions contiguous sub-regions of an Excel worksheet into distinct elements. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-08 15:33:41 -08:00
**kwargs: Any,
) -> list[Element]:
"""Partitions Microsoft Excel Documents in .xlsx format into its document elements.
Parameters
----------
filename
A string defining the target filename path.
file
A file-like object using "rb" mode --> open(filename, "rb").
include_metadata
Determines whether or not metadata is included in the output.
infer_table_structure
If True, any Table elements that are extracted will also have a metadata field
named "text_as_html" where the table's text content is rendered into an html string.
I.e., rows and cells are preserved.
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
languages
User defined value for metadata.languages if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
metadata_last_modified
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
The day of the last modification
include_header
Determines whether or not header info is included in text and medatada.text_as_html
"""
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
last_modification_date = None
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
header = 0 if include_header else None
rfctr(xlsx): cleaning in prep for XLSX algorithm replacement (#2524) **Reviewers:** It may be faster to review each of the three commits separately since they are groomed to only make one type of change each (typing, docstrings, test-cleanup). **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). These commits clean up typing, lint, and other non-behavior-changing aspects of the code in preparation for installing a new algorithm that correctly identifies and partitions contiguous sub-regions of an Excel worksheet into distinct elements. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-08 15:33:41 -08:00
sheets: dict[str, pd.DataFrame] = {}
if filename:
rfctr(xlsx): cleaning in prep for XLSX algorithm replacement (#2524) **Reviewers:** It may be faster to review each of the three commits separately since they are groomed to only make one type of change each (typing, docstrings, test-cleanup). **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). These commits clean up typing, lint, and other non-behavior-changing aspects of the code in preparation for installing a new algorithm that correctly identifies and partitions contiguous sub-regions of an Excel worksheet into distinct elements. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-08 15:33:41 -08:00
sheets = pd.read_excel( # pyright: ignore[reportUnknownMemberType]
filename, sheet_name=None, header=header
)
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
last_modification_date = get_last_modified_date(filename)
elif file:
rfctr(xlsx): cleaning in prep for XLSX algorithm replacement (#2524) **Reviewers:** It may be faster to review each of the three commits separately since they are groomed to only make one type of change each (typing, docstrings, test-cleanup). **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). These commits clean up typing, lint, and other non-behavior-changing aspects of the code in preparation for installing a new algorithm that correctly identifies and partitions contiguous sub-regions of an Excel worksheet into distinct elements. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-08 15:33:41 -08:00
if isinstance(file, SpooledTemporaryFile):
file.seek(0)
f = io.BytesIO(file.read())
else:
f = file
sheets = pd.read_excel( # pyright: ignore[reportUnknownMemberType]
f, sheet_name=None, header=header
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
)
last_modification_date = get_last_modified_date_from_file(file)
rfctr(xlsx): cleaning in prep for XLSX algorithm replacement (#2524) **Reviewers:** It may be faster to review each of the three commits separately since they are groomed to only make one type of change each (typing, docstrings, test-cleanup). **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). These commits clean up typing, lint, and other non-behavior-changing aspects of the code in preparation for installing a new algorithm that correctly identifies and partitions contiguous sub-regions of an Excel worksheet into distinct elements. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-08 15:33:41 -08:00
else:
fix(xlsx): xlsx subtable algorithm (#2534) **Reviewers:** It may be easier to review each of the two commits separately. The first adds the new `_SubtableParser` object with its unit-tests and the second one uses that object to replace the flawed existing subtable-parsing algorithm. **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). This PR replaces the flawed existing algorithm with a `_SubtableParser` object that encapsulates all that logic and has thorough unit-tests. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes. This PR fixes all these bugs: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-13 20:29:17 -08:00
raise ValueError("Either 'filename' or 'file' argument must be specified")
rfctr(xlsx): cleaning in prep for XLSX algorithm replacement (#2524) **Reviewers:** It may be faster to review each of the three commits separately since they are groomed to only make one type of change each (typing, docstrings, test-cleanup). **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). These commits clean up typing, lint, and other non-behavior-changing aspects of the code in preparation for installing a new algorithm that correctly identifies and partitions contiguous sub-regions of an Excel worksheet into distinct elements. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-08 15:33:41 -08:00
elements: list[Element] = []
for page_number, (sheet_name, sheet) in enumerate(sheets.items(), start=1):
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
if not find_subtable:
html_text = (
rfctr(xlsx): cleaning in prep for XLSX algorithm replacement (#2524) **Reviewers:** It may be faster to review each of the three commits separately since they are groomed to only make one type of change each (typing, docstrings, test-cleanup). **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). These commits clean up typing, lint, and other non-behavior-changing aspects of the code in preparation for installing a new algorithm that correctly identifies and partitions contiguous sub-regions of an Excel worksheet into distinct elements. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-08 15:33:41 -08:00
sheet.to_html( # pyright: ignore[reportUnknownMemberType]
index=False, header=include_header, na_rep=""
)
if infer_table_structure
else None
)
rfctr(xlsx): cleaning in prep for XLSX algorithm replacement (#2524) **Reviewers:** It may be faster to review each of the three commits separately since they are groomed to only make one type of change each (typing, docstrings, test-cleanup). **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). These commits clean up typing, lint, and other non-behavior-changing aspects of the code in preparation for installing a new algorithm that correctly identifies and partitions contiguous sub-regions of an Excel worksheet into distinct elements. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-08 15:33:41 -08:00
# XXX: `html_text` can be `None`. What happens on this call in that case?
text = cast(
str,
soupparser_fromstring( # pyright: ignore[reportUnknownMemberType]
html_text
).text_content(),
)
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
if include_metadata:
metadata = ElementMetadata(
text_as_html=html_text,
page_name=sheet_name,
page_number=page_number,
filename=metadata_filename or filename,
last_modified=metadata_last_modified or last_modification_date,
)
metadata.detection_origin = DETECTION_ORIGIN
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
else:
metadata = ElementMetadata()
table = Table(text=text, metadata=metadata)
elements.append(table)
else:
for component in _ConnectedComponents.from_worksheet_df(sheet):
subtable_parser = _SubtableParser(component.subtable)
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
metadata = _get_metadata(
include_metadata,
sheet_name,
page_number,
metadata_filename or filename,
metadata_last_modified or last_modification_date,
)
fix(xlsx): xlsx subtable algorithm (#2534) **Reviewers:** It may be easier to review each of the two commits separately. The first adds the new `_SubtableParser` object with its unit-tests and the second one uses that object to replace the flawed existing subtable-parsing algorithm. **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). This PR replaces the flawed existing algorithm with a `_SubtableParser` object that encapsulates all that logic and has thorough unit-tests. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes. This PR fixes all these bugs: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-13 20:29:17 -08:00
# -- emit each leading single-cell row as its own `Text`-subtype element --
for content in subtable_parser.iter_leading_single_cell_rows_texts():
element = _check_content_element_type(str(content))
element.metadata = metadata
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
elements.append(element)
rfctr(xlsx): cleaning in prep for XLSX algorithm replacement (#2524) **Reviewers:** It may be faster to review each of the three commits separately since they are groomed to only make one type of change each (typing, docstrings, test-cleanup). **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). These commits clean up typing, lint, and other non-behavior-changing aspects of the code in preparation for installing a new algorithm that correctly identifies and partitions contiguous sub-regions of an Excel worksheet into distinct elements. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-08 15:33:41 -08:00
# -- emit core-table (if it exists) as a `Table` element --
fix(xlsx): xlsx subtable algorithm (#2534) **Reviewers:** It may be easier to review each of the two commits separately. The first adds the new `_SubtableParser` object with its unit-tests and the second one uses that object to replace the flawed existing subtable-parsing algorithm. **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). This PR replaces the flawed existing algorithm with a `_SubtableParser` object that encapsulates all that logic and has thorough unit-tests. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes. This PR fixes all these bugs: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-13 20:29:17 -08:00
core_table = subtable_parser.core_table
if core_table is not None:
html_text = core_table.to_html( # pyright: ignore[reportUnknownMemberType]
rfctr(xlsx): cleaning in prep for XLSX algorithm replacement (#2524) **Reviewers:** It may be faster to review each of the three commits separately since they are groomed to only make one type of change each (typing, docstrings, test-cleanup). **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). These commits clean up typing, lint, and other non-behavior-changing aspects of the code in preparation for installing a new algorithm that correctly identifies and partitions contiguous sub-regions of an Excel worksheet into distinct elements. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-08 15:33:41 -08:00
index=False, header=include_header, na_rep=""
)
text = cast(
str,
soupparser_fromstring( # pyright: ignore[reportUnknownMemberType]
html_text
).text_content(),
)
fix(xlsx): xlsx subtable algorithm (#2534) **Reviewers:** It may be easier to review each of the two commits separately. The first adds the new `_SubtableParser` object with its unit-tests and the second one uses that object to replace the flawed existing subtable-parsing algorithm. **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). This PR replaces the flawed existing algorithm with a `_SubtableParser` object that encapsulates all that logic and has thorough unit-tests. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes. This PR fixes all these bugs: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-13 20:29:17 -08:00
element = Table(text=text)
element.metadata = metadata
element.metadata.text_as_html = html_text if infer_table_structure else None
elements.append(element)
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
rfctr(xlsx): cleaning in prep for XLSX algorithm replacement (#2524) **Reviewers:** It may be faster to review each of the three commits separately since they are groomed to only make one type of change each (typing, docstrings, test-cleanup). **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). These commits clean up typing, lint, and other non-behavior-changing aspects of the code in preparation for installing a new algorithm that correctly identifies and partitions contiguous sub-regions of an Excel worksheet into distinct elements. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-08 15:33:41 -08:00
# -- no core-table is emitted if it's empty (all rows are single-cell rows) --
fix(xlsx): xlsx subtable algorithm (#2534) **Reviewers:** It may be easier to review each of the two commits separately. The first adds the new `_SubtableParser` object with its unit-tests and the second one uses that object to replace the flawed existing subtable-parsing algorithm. **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). This PR replaces the flawed existing algorithm with a `_SubtableParser` object that encapsulates all that logic and has thorough unit-tests. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes. This PR fixes all these bugs: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-13 20:29:17 -08:00
# -- emit each trailing single-cell row as its own `Text`-subtype element --
for content in subtable_parser.iter_trailing_single_cell_rows_texts():
element = _check_content_element_type(str(content))
element.metadata = metadata
elements.append(element)
elements = list(
apply_lang_metadata(
elements=elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
),
)
return elements
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
class _ConnectedComponent:
"""A collection of cells that are "2d-connected" in a worksheet.
2d-connected means there is a path from each cell to every other cell by traversing up, down,
left, or right (not diagonally).
"""
def __init__(self, worksheet: pd.DataFrame, cell_coordinate_set: set[_CellCoordinate]):
self._worksheet = worksheet
self._cell_coordinate_set = cell_coordinate_set
@lazyproperty
def max_x(self) -> int:
"""The right-most column index of the connected component."""
return self._extents[2]
def merge(self, other: _ConnectedComponent) -> _ConnectedComponent:
"""Produce new instance with union of cells in `self` and `other`.
Used to combine regions of workshet that are "overlapping" row-wise but not actually
2D-connected.
"""
return _ConnectedComponent(
self._worksheet, self._cell_coordinate_set.union(other._cell_coordinate_set)
)
@lazyproperty
def min_x(self) -> int:
"""The left-most column index of the connected component."""
return self._extents[0]
@lazyproperty
def subtable(self) -> pd.DataFrame:
"""The connected region of the worksheet as a `DataFrame`.
The subtable is the rectangular region of the worksheet inside the connected-component
bounding-box. Row-indices and column labels are preserved, not restarted at 0.
"""
min_x, min_y, max_x, max_y = self._extents
return self._worksheet.iloc[min_x : max_x + 1, min_y : max_y + 1]
@lazyproperty
def _extents(self) -> tuple[int, int, int, int]:
"""Compute bounding box of this connected component."""
min_x, min_y, max_x, max_y = float("inf"), float("inf"), float("-inf"), float("-inf")
for x, y in self._cell_coordinate_set:
if x < min_x:
min_x = x
if x > max_x:
max_x = x
if y < min_y:
min_y = y
if y > max_y:
max_y = y
return int(min_x), int(min_y), int(max_x), int(max_y)
class _ConnectedComponents:
"""The collection of connected-components for a single worksheet.
"Connected-components" refers to the graph algorithm we use to detect contiguous groups of
non-empty cells in an excel sheet.
"""
def __init__(self, worksheet_df: pd.DataFrame):
self._worksheet_df = worksheet_df
def __iter__(self) -> Iterator[_ConnectedComponent]:
return iter(self._connected_components)
@classmethod
def from_worksheet_df(cls, worksheet_df: pd.DataFrame) -> Self:
"""Construct from a worksheet dataframe produced by reading Excel with pandas."""
return cls(worksheet_df)
@lazyproperty
def _connected_components(self) -> list[_ConnectedComponent]:
"""The `_ConnectedComponent` objects comprising this collection."""
# -- produce a 2D-graph representing the populated cells of the worksheet (or subsheet).
# -- A 2D-graph relates each populated cell to the one above, below, left, and right of it.
max_row, max_col = self._worksheet_df.shape
node_array = np.indices((max_row, max_col)).T
empty_cells = self._worksheet_df.isna().T
nodes_to_remove = [tuple(pair) for pair in node_array[empty_cells]]
graph: nx.Graph = nx.grid_2d_graph(max_row, max_col) # pyright: ignore
graph.remove_nodes_from(nodes_to_remove) # pyright: ignore
# -- compute sets of nodes representing each connected-component --
connected_node_sets: Iterator[set[_CellCoordinate]]
connected_node_sets = nx.connected_components( # pyright: ignore[reportUnknownMemberType]
graph
)
return list(
self._merge_overlapping_tables(
[
_ConnectedComponent(self._worksheet_df, component_node_set)
for component_node_set in connected_node_sets
]
)
)
def _merge_overlapping_tables(
self, connected_components: list[_ConnectedComponent]
) -> Iterator[_ConnectedComponent]:
"""Merge connected-components that overlap row-wise.
A pair of overlapping components might look like one of these:
x x x x x
x x x x x
x x OR x x
x
x x x
"""
# -- order connected-components by their top row --
sorted_components = sorted(connected_components, key=lambda x: x.min_x)
current_component = None
for component in sorted_components:
# -- prime the pump --
if current_component is None:
current_component = component
continue
# -- merge this next component with prior if it overlaps row-wise. Note the merged
# -- component becomes the new current-component.
if component.min_x <= current_component.max_x:
current_component = current_component.merge(component)
# -- otherwise flush and move on --
else:
yield current_component
current_component = component
# -- flush last component --
if current_component is not None:
yield current_component
fix(xlsx): xlsx subtable algorithm (#2534) **Reviewers:** It may be easier to review each of the two commits separately. The first adds the new `_SubtableParser` object with its unit-tests and the second one uses that object to replace the flawed existing subtable-parsing algorithm. **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). This PR replaces the flawed existing algorithm with a `_SubtableParser` object that encapsulates all that logic and has thorough unit-tests. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes. This PR fixes all these bugs: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-13 20:29:17 -08:00
class _SubtableParser:
"""Distinguishes core-table from leading and trailing title rows in a subtable.
A *subtable* is a contiguous block of populated cells in the spreadsheet. Leading or trailing
rows of that block containing only one populated cell are called "single-cell rows" and are
not considered part of the core table. These are each emitted separately as a `Text`-subtype
element.
"""
def __init__(self, subtable: pd.DataFrame):
self._subtable = subtable
@lazyproperty
def core_table(self) -> pd.DataFrame | None:
"""The part between the leading and trailing single-cell rows, if any."""
core_table_start = len(self._leading_single_cell_row_indices)
# -- if core-table start is the end of table, there is no core-table
# -- (all rows are single-cell)
if core_table_start == len(self._subtable):
return None
# -- assert: there is at least one core-table row (leading single-cell rows greedily
# -- consumes all consecutive single-cell rows.
core_table_stop = len(self._subtable) - len(self._trailing_single_cell_row_indices)
# -- core-table is what's left in-between --
return self._subtable[core_table_start:core_table_stop]
def iter_leading_single_cell_rows_texts(self) -> Iterator[str]:
"""Generate the cell-text for each leading single-cell row."""
for row_idx in self._leading_single_cell_row_indices:
yield self._subtable.iloc[row_idx].dropna().iloc[0] # pyright: ignore
def iter_trailing_single_cell_rows_texts(self) -> Iterator[str]:
"""Generate the cell-text for each trailing single-cell row."""
for row_idx in self._trailing_single_cell_row_indices:
yield self._subtable.iloc[row_idx].dropna().iloc[0] # pyright: ignore
@lazyproperty
def _leading_single_cell_row_indices(self) -> tuple[int, ...]:
"""Index of each leading single-cell row in subtable, in top-down order."""
def iter_leading_single_cell_row_indices() -> Iterator[int]:
for next_row_idx, idx in enumerate(self._single_cell_row_indices):
if idx != next_row_idx:
return
yield next_row_idx
return tuple(iter_leading_single_cell_row_indices())
@lazyproperty
def _single_cell_row_indices(self) -> tuple[int, ...]:
"""Index of each single-cell row in subtable, in top-down order."""
def iter_single_cell_row_idxs() -> Iterator[int]:
for idx, (_, row) in enumerate(self._subtable.iterrows()): # pyright: ignore
if row.count() != 1:
continue
yield idx
return tuple(iter_single_cell_row_idxs())
@lazyproperty
def _trailing_single_cell_row_indices(self) -> tuple[int, ...]:
"""Index of each trailing single-cell row in subtable, in top-down order."""
# -- if all subtable rows are single-cell, then by convention they are all leading --
if len(self._leading_single_cell_row_indices) == len(self._subtable):
return ()
def iter_trailing_single_cell_row_indices() -> Iterator[int]:
"""... moving from end upward ..."""
next_row_idx = len(self._subtable) - 1
for idx in self._single_cell_row_indices[::-1]:
if idx != next_row_idx:
return
yield next_row_idx
next_row_idx -= 1
return tuple(reversed(list(iter_trailing_single_cell_row_indices())))
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
def _check_content_element_type(text: str) -> Element:
rfctr(xlsx): cleaning in prep for XLSX algorithm replacement (#2524) **Reviewers:** It may be faster to review each of the three commits separately since they are groomed to only make one type of change each (typing, docstrings, test-cleanup). **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). These commits clean up typing, lint, and other non-behavior-changing aspects of the code in preparation for installing a new algorithm that correctly identifies and partitions contiguous sub-regions of an Excel worksheet into distinct elements. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-08 15:33:41 -08:00
"""Create `Text`-subtype document element appropriate to `text`."""
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
if is_bulleted_text(text):
fix(xlsx): xlsx subtable algorithm (#2534) **Reviewers:** It may be easier to review each of the two commits separately. The first adds the new `_SubtableParser` object with its unit-tests and the second one uses that object to replace the flawed existing subtable-parsing algorithm. **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). This PR replaces the flawed existing algorithm with a `_SubtableParser` object that encapsulates all that logic and has thorough unit-tests. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes. This PR fixes all these bugs: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-13 20:29:17 -08:00
return ListItem(text=clean_bullets(text))
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
elif is_possible_numbered_list(text):
fix(xlsx): xlsx subtable algorithm (#2534) **Reviewers:** It may be easier to review each of the two commits separately. The first adds the new `_SubtableParser` object with its unit-tests and the second one uses that object to replace the flawed existing subtable-parsing algorithm. **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). This PR replaces the flawed existing algorithm with a `_SubtableParser` object that encapsulates all that logic and has thorough unit-tests. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes. This PR fixes all these bugs: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-13 20:29:17 -08:00
return ListItem(text=text)
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
elif is_possible_narrative_text(text):
fix(xlsx): xlsx subtable algorithm (#2534) **Reviewers:** It may be easier to review each of the two commits separately. The first adds the new `_SubtableParser` object with its unit-tests and the second one uses that object to replace the flawed existing subtable-parsing algorithm. **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). This PR replaces the flawed existing algorithm with a `_SubtableParser` object that encapsulates all that logic and has thorough unit-tests. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes. This PR fixes all these bugs: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-13 20:29:17 -08:00
return NarrativeText(text=text)
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
elif is_possible_title(text):
fix(xlsx): xlsx subtable algorithm (#2534) **Reviewers:** It may be easier to review each of the two commits separately. The first adds the new `_SubtableParser` object with its unit-tests and the second one uses that object to replace the flawed existing subtable-parsing algorithm. **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). This PR replaces the flawed existing algorithm with a `_SubtableParser` object that encapsulates all that logic and has thorough unit-tests. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes. This PR fixes all these bugs: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-13 20:29:17 -08:00
return Title(text=text)
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
else:
fix(xlsx): xlsx subtable algorithm (#2534) **Reviewers:** It may be easier to review each of the two commits separately. The first adds the new `_SubtableParser` object with its unit-tests and the second one uses that object to replace the flawed existing subtable-parsing algorithm. **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). This PR replaces the flawed existing algorithm with a `_SubtableParser` object that encapsulates all that logic and has thorough unit-tests. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes. This PR fixes all these bugs: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-13 20:29:17 -08:00
return Text(text=text)
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
def _get_metadata(
include_metadata: bool = True,
sheet_name: Optional[str] = None,
page_number: Optional[int] = -1,
filename: Optional[str] = None,
rfctr(xlsx): cleaning in prep for XLSX algorithm replacement (#2524) **Reviewers:** It may be faster to review each of the three commits separately since they are groomed to only make one type of change each (typing, docstrings, test-cleanup). **Summary** There are a cluster of bugs in `partition_xlsx()` that all derive from flaws in the algorithm we use to detect "subtables". These are encountered when the user wants to get multiple document-elements from each worksheet, which is the default (argument `find_subtable = True`). These commits clean up typing, lint, and other non-behavior-changing aspects of the code in preparation for installing a new algorithm that correctly identifies and partitions contiguous sub-regions of an Excel worksheet into distinct elements. **Additional Context** This is a summary of the failure cases. There are a few other cases but they're closely related and this was enough evidence and scope for my purposes: ```python # # -- ✅ CASE 1: There are no leading or trailing single-cell rows. # -> this subtable functions never get called, subtable is emitted as the only element # # a b -> Table(a, b, c, d) # c d # -- ✅ CASE 2: There is exactly one leading single-cell row. # -> Leading single-cell row emitted as `Title` element, core-table properly identified. # # a -> [ Title(a), # b c Table(b, c, d, e) ] # d e # -- ❌ CASE 3: There are two-or-more leading single-cell rows. # -> leading single-cell rows are included in subtable # # a -> [ Table(a, b, c, d, e, f) ] # b # c d # e f # -- ❌ CASE 4: There is exactly one trailing single-cell row. # -> core table is dropped. trailing single-cell row is emitted as Title # (this is the behavior in the reported bug) # # a b -> [ Title(e) ] # c d # e # -- ❌ CASE 5: There are two-or-more trailing single-cell rows. # -> core table is dropped. trailing single-cell rows are each emitted as a Title # # a b -> [ Title(e), # c d Title(f) ] # e # f # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b c Table(b, c, d, e), # d e Title(f) ] # f # -- ✅ CASE 7: There are two leading and one trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g) ] # g # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows. # -> core table is correctly identified, leading and trailing single-cell rows are each # emitted as a Title. # # a -> [ Title(a), # b Title(b), # c d Table(c, d, e, f), # e f Title(g), # g Title(h) ] # h # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below. # -> First cell is mistakenly emitted as title, remaining cells are dropped. # # a b c -> [ Title(a) ] # -- ❌ CASE 10: Single-row subtable with one leading single-cell row. # -> Leading single-row cell is correctly identified as title, core-table is mis-identified # as a `Title` and truncated. # # a -> [ Title(a), # b c d Title(b) ] ```
2024-02-08 15:33:41 -08:00
last_modification_date: Optional[str] = None,
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
) -> ElementMetadata:
"""Returns metadata depending on `include_metadata` flag"""
if include_metadata:
metadata = ElementMetadata(
page_name=sheet_name,
page_number=page_number,
filename=filename,
last_modified=last_modification_date,
)
else:
metadata = ElementMetadata()
return metadata