2023-09-26 19:24:21 -04:00
|
|
|
#
|
|
|
|
# This file is autogenerated by pip-compile with Python 3.8
|
|
|
|
# by the following command:
|
|
|
|
#
|
|
|
|
# pip-compile requirements/ingest-openai.in
|
|
|
|
#
|
|
|
|
aiohttp==3.8.5
|
|
|
|
# via
|
|
|
|
# langchain
|
|
|
|
# openai
|
|
|
|
aiosignal==1.3.1
|
|
|
|
# via aiohttp
|
2023-09-29 14:09:57 -05:00
|
|
|
anyio==3.7.1
|
|
|
|
# via
|
|
|
|
# -c requirements/constraints.in
|
|
|
|
# langchain
|
2023-09-26 19:24:21 -04:00
|
|
|
async-timeout==4.0.3
|
|
|
|
# via
|
|
|
|
# aiohttp
|
|
|
|
# langchain
|
|
|
|
attrs==23.1.0
|
|
|
|
# via aiohttp
|
|
|
|
certifi==2023.7.22
|
|
|
|
# via
|
|
|
|
# -c requirements/base.txt
|
|
|
|
# -c requirements/constraints.in
|
|
|
|
# requests
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
charset-normalizer==3.3.0
|
2023-09-26 19:24:21 -04:00
|
|
|
# via
|
|
|
|
# -c requirements/base.txt
|
|
|
|
# aiohttp
|
|
|
|
# requests
|
|
|
|
dataclasses-json==0.6.1
|
|
|
|
# via
|
|
|
|
# -c requirements/base.txt
|
|
|
|
# langchain
|
2023-09-29 14:09:57 -05:00
|
|
|
exceptiongroup==1.1.3
|
|
|
|
# via anyio
|
2023-09-26 19:24:21 -04:00
|
|
|
frozenlist==1.4.0
|
|
|
|
# via
|
|
|
|
# aiohttp
|
|
|
|
# aiosignal
|
|
|
|
idna==3.4
|
|
|
|
# via
|
|
|
|
# -c requirements/base.txt
|
2023-09-29 14:09:57 -05:00
|
|
|
# anyio
|
2023-09-26 19:24:21 -04:00
|
|
|
# requests
|
|
|
|
# yarl
|
2023-09-29 14:09:57 -05:00
|
|
|
jsonpatch==1.33
|
|
|
|
# via langchain
|
|
|
|
jsonpointer==2.4
|
|
|
|
# via jsonpatch
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
langchain==0.0.305
|
2023-09-26 19:24:21 -04:00
|
|
|
# via -r requirements/ingest-openai.in
|
feat: get embedded url, associate text and start index for pdf (#1539)
**Executive Summary**
Adds PDF functionality to capture hyperlink (external or internal) for
pdf fast strategy along with associate text.
**Technical Details**
- `pdfminer` associates `annotation` (links and uris) with bounding box
rather than text. Therefore, the link and text matching is not a perfect
pair but rather a logic-based and calculation matching from bounding box
overlapping.
- There is no word-level bounding box. Only character-level (access
using `LTChar`). Thus in order to get to word-level, there is a window
slicing through the text. The words are captured in alphanumeric and
non-alphanumeric separately, meaning it will split the word if contains
both, on the first encounter of non-alphanumeric.)
- The bounding box calculation is calculated using start and stop
coordinates for the corresponding word calculated from above. The
calculation is simply using distance between two dots.
The result now contains `links` in `metadata` as shown below:
```
"links": [
{
"text": "link",
"url": "https://github.com/Unstructured-IO/unstructured",
"start_index": 12
},
{
"text": "email",
"url": "mailto:unstructuredai@earlygrowth.com",
"start_index": 30
},
{
"text": "phone number",
"url": "tel:6505124019",
"start_index": 49
}
]
```
---------
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Klaijan <Klaijan@users.noreply.github.com>
2023-09-27 13:43:32 -04:00
|
|
|
langsmith==0.0.41
|
2023-09-26 19:24:21 -04:00
|
|
|
# via langchain
|
|
|
|
marshmallow==3.20.1
|
|
|
|
# via
|
|
|
|
# -c requirements/base.txt
|
|
|
|
# dataclasses-json
|
|
|
|
multidict==6.0.4
|
|
|
|
# via
|
|
|
|
# aiohttp
|
|
|
|
# yarl
|
|
|
|
mypy-extensions==1.0.0
|
|
|
|
# via
|
|
|
|
# -c requirements/base.txt
|
|
|
|
# typing-inspect
|
|
|
|
numexpr==2.8.6
|
|
|
|
# via langchain
|
|
|
|
numpy==1.24.4
|
|
|
|
# via
|
feat: get embedded url, associate text and start index for pdf (#1539)
**Executive Summary**
Adds PDF functionality to capture hyperlink (external or internal) for
pdf fast strategy along with associate text.
**Technical Details**
- `pdfminer` associates `annotation` (links and uris) with bounding box
rather than text. Therefore, the link and text matching is not a perfect
pair but rather a logic-based and calculation matching from bounding box
overlapping.
- There is no word-level bounding box. Only character-level (access
using `LTChar`). Thus in order to get to word-level, there is a window
slicing through the text. The words are captured in alphanumeric and
non-alphanumeric separately, meaning it will split the word if contains
both, on the first encounter of non-alphanumeric.)
- The bounding box calculation is calculated using start and stop
coordinates for the corresponding word calculated from above. The
calculation is simply using distance between two dots.
The result now contains `links` in `metadata` as shown below:
```
"links": [
{
"text": "link",
"url": "https://github.com/Unstructured-IO/unstructured",
"start_index": 12
},
{
"text": "email",
"url": "mailto:unstructuredai@earlygrowth.com",
"start_index": 30
},
{
"text": "phone number",
"url": "tel:6505124019",
"start_index": 49
}
]
```
---------
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Klaijan <Klaijan@users.noreply.github.com>
2023-09-27 13:43:32 -04:00
|
|
|
# -c requirements/base.txt
|
2023-09-26 19:24:21 -04:00
|
|
|
# -c requirements/constraints.in
|
|
|
|
# langchain
|
|
|
|
# numexpr
|
|
|
|
openai==0.28.1
|
|
|
|
# via -r requirements/ingest-openai.in
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
packaging==23.2
|
2023-09-26 19:24:21 -04:00
|
|
|
# via
|
|
|
|
# -c requirements/base.txt
|
|
|
|
# marshmallow
|
2023-09-29 14:09:57 -05:00
|
|
|
pydantic==1.10.13
|
2023-09-26 19:24:21 -04:00
|
|
|
# via
|
|
|
|
# -c requirements/constraints.in
|
|
|
|
# langchain
|
|
|
|
# langsmith
|
|
|
|
pyyaml==6.0.1
|
|
|
|
# via langchain
|
|
|
|
regex==2023.8.8
|
|
|
|
# via
|
|
|
|
# -c requirements/base.txt
|
|
|
|
# tiktoken
|
|
|
|
requests==2.31.0
|
|
|
|
# via
|
|
|
|
# -c requirements/base.txt
|
|
|
|
# langchain
|
|
|
|
# langsmith
|
|
|
|
# openai
|
|
|
|
# tiktoken
|
2023-09-29 14:09:57 -05:00
|
|
|
sniffio==1.3.0
|
|
|
|
# via anyio
|
2023-09-26 19:24:21 -04:00
|
|
|
sqlalchemy==2.0.21
|
|
|
|
# via langchain
|
|
|
|
tenacity==8.2.3
|
|
|
|
# via langchain
|
|
|
|
tiktoken==0.5.1
|
|
|
|
# via -r requirements/ingest-openai.in
|
|
|
|
tqdm==4.66.1
|
|
|
|
# via
|
|
|
|
# -c requirements/base.txt
|
|
|
|
# openai
|
|
|
|
typing-extensions==4.8.0
|
|
|
|
# via
|
|
|
|
# -c requirements/base.txt
|
|
|
|
# pydantic
|
|
|
|
# sqlalchemy
|
|
|
|
# typing-inspect
|
|
|
|
typing-inspect==0.9.0
|
|
|
|
# via
|
|
|
|
# -c requirements/base.txt
|
|
|
|
# dataclasses-json
|
|
|
|
urllib3==1.26.16
|
|
|
|
# via
|
|
|
|
# -c requirements/base.txt
|
|
|
|
# -c requirements/constraints.in
|
|
|
|
# requests
|
|
|
|
yarl==1.9.2
|
|
|
|
# via aiohttp
|