unstructured/requirements/huggingface.txt

112 lines
2.3 KiB
Plaintext
Raw Normal View History

#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/huggingface.in
#
certifi==2023.7.22
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the Excel file *without* a header by default - Leverages a connected-components search to find subtables within the sheet. This search is based on depth-first search (DFS) - It also handles the overlapping table or text cases - A row with only a single cell of data is not considered a table, and is therefore passed on to determine the element type as text - In connected elements, it is possible to have a table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text elements **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n 
</tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n 
<td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine 
quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": 
"080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests
click==8.1.7
# via
# -c requirements/base.txt
# sacremoses
[CORE-1741] use forked pytesseract to reduce calls to tesseract (#1298) This PR resolves [CORE-1741](https://unstructured-ai.atlassian.net/browse/CORE-1741) by using a new function `pytesseract.run_and_get_multiple_output`, see forked repo for more details: https://github.com/Unstructured-IO/unstructured.pytesseract/releases/tag/0.3.11-dev1 This reduces the calls to `tesseract` by half per page of PDF/image during partition, roughly reducing the runtime by 48%. The new function is in forked `unstructured.pytesseract`. A PR has been made to the upstream repo and once that is merged we should switch to the upstream version. For now we add a new dependency: `unstructured.pytesseract`. ## testing Existing unit tests should serve as tests for the new function. To demonstrate the changes in performance: - checkout main - run `./scripts/performance/profile.sh` and select `ocr_only` strategy, using the 10th document (16 page layout paper in pdf format) - examine the speedscope profile or time profile in flamegraph -> should see that the two dominant time spenders are `pytesseract.image_to_text` and `pytesseract.image_to_boxes`, with both about the same total time (see attached first image) - checkout this branch - run the same `profile.sh` with the same options - examine the profile again and this time should notice 1) total runtime is reduced by more than 40%; 2) only `unstructured_pytesseract.run_and_get_multiple_output` is the top time spender and its total time is about the same as either the `pytesseract.image_to_text` or `pytesseract.image_to_boxes` time (see second image below) ![Screenshot 2023-09-06 at 9 45 10 AM](https://github.com/Unstructured-IO/unstructured/assets/647930/fed6118b-a0dc-493d-bef8-85d73027c968) ![Screenshot 2023-09-06 at 9 46 37 AM](https://github.com/Unstructured-IO/unstructured/assets/647930/dd1d6369-cfba-43d4-b1c6-87a8a98b2e16) [CORE-1741]: 
https://unstructured-ai.atlassian.net/browse/CORE-1741?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ --------- Co-authored-by: Benjamin Torres <benjats07@users.noreply.github.com> Co-authored-by: cragwolfe <crag@unstructured.io>
2023-09-14 18:27:18 -05:00
filelock==3.12.4
# via
# huggingface-hub
# torch
# transformers
fsspec==2023.9.1
# via
# -c requirements/constraints.in
# huggingface-hub
huggingface-hub==0.17.3
# via transformers
idna==3.4
# via
# -c requirements/base.txt
# requests
jinja2==3.1.2
# via torch
joblib==1.3.2
# via
# -c requirements/base.txt
# sacremoses
langdetect==1.0.9
# via
# -c requirements/base.txt
# -r requirements/huggingface.in
markupsafe==2.1.3
# via jinja2
mpmath==1.3.0
# via sympy
networkx==3.1
# via torch
numpy==1.24.4
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# transformers
feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the Excel file *without* a header by default - Leverages a connected-components search to find subtables within the sheet. This search is based on depth-first search (DFS) - It also handles the overlapping table or text cases - A row with only a single cell of data is not considered a table, and is therefore passed on to determine the element type as text - In connected elements, it is possible to have a table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text elements **Result** This table now reads as: <img width="747" alt="image" src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2"> ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n 
</tbody>\n</table>" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n 
<td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine 
quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "Other" }, { "type": "Table", "element_id": 
"080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ```
2023-10-04 13:30:23 -04:00
packaging==23.2
# via
# -c requirements/base.txt
# huggingface-hub
# transformers
pyyaml==6.0.1
# via
# huggingface-hub
# transformers
regex==2023.8.8
# via
# -c requirements/base.txt
# sacremoses
# transformers
requests==2.31.0
# via
# -c requirements/base.txt
# huggingface-hub
# transformers
sacremoses==0.0.53
# via -r requirements/huggingface.in
safetensors==0.3.2
# via
# -c requirements/constraints.in
# transformers
sentencepiece==0.1.99
# via -r requirements/huggingface.in
six==1.16.0
# via
# -c requirements/base.txt
# langdetect
# sacremoses
sympy==1.12
# via torch
tokenizers==0.13.3
# via transformers
torch==2.0.1
# via -r requirements/huggingface.in
tqdm==4.66.1
# via
# -c requirements/base.txt
# huggingface-hub
# sacremoses
# transformers
transformers==4.33.3
# via -r requirements/huggingface.in
typing-extensions==4.8.0
# via
# -c requirements/base.txt
# huggingface-hub
# torch
urllib3==1.26.16
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests