feat: xlsx subtable extraction (#1585)

**Executive Summary**
Unstructured can now capture subtables, along with other text element types, within an `.xlsx` sheet.

**Technical Details**
- The function now reads the Excel sheet *without* a header by default
- It leverages a connected-components search (based on DFS) to find subtables within the sheet
- It also handles cases where tables and text overlap
- A row containing only a single cell of data is not considered part of a table; it is passed on so its element type can be determined as text
- A connected component may also contain a table title, header, or footer. We count the leading and trailing single-cell rows, from the top and bottom of the component, to identify that text (see the sketch below)
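
For illustration, here is a minimal sketch of the connected-components idea, assuming the sheet has already been read into a pandas DataFrame with `header=None`. The function and variable names are illustrative and do not mirror the library's internal implementation.

```python
import pandas as pd


def find_subtable_row_spans(df: pd.DataFrame) -> list[tuple[int, int]]:
    """Illustrative only: return (first_row, last_row) spans of connected
    regions of non-empty cells, found via an iterative DFS over the
    8-neighborhood of each cell."""
    mask = df.notna().to_numpy()
    n_rows, n_cols = mask.shape
    visited = [[False] * n_cols for _ in range(n_rows)]
    spans = []
    for r in range(n_rows):
        for c in range(n_cols):
            if not mask[r][c] or visited[r][c]:
                continue
            # DFS from this non-empty cell collects one connected component.
            stack, rows_in_component = [(r, c)], set()
            visited[r][c] = True
            while stack:
                cur_r, cur_c = stack.pop()
                rows_in_component.add(cur_r)
                for dr in (-1, 0, 1):
                    for dc in (-1, 0, 1):
                        nr, nc = cur_r + dr, cur_c + dc
                        if (0 <= nr < n_rows and 0 <= nc < n_cols
                                and mask[nr][nc] and not visited[nr][nc]):
                            visited[nr][nc] = True
                            stack.append((nr, nc))
            spans.append((min(rows_in_component), max(rows_in_component)))
    return spans
```

Rows at the top or bottom of a span that contain only a single non-empty cell would then be peeled off and emitted as title/text elements rather than table rows.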

**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">

```
[
    {
        "type": "Title",
        "element_id": "3315afd97f7f2ebcd450e7c939878429",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Financial performance"
    },
    {
        "type": "Table",
        "element_id": "17f5d512705be6f8812e5dbb801ba727",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "8a9db7161a02b427f8fda883656036e1",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Operational metrics"
    },
    {
        "type": "Table",
        "element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Other"
    },
    {
        "type": "Table",
        "element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
    }
]
```
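
For reference, output like the above can be reproduced with a short snippet along these lines, using the example workbook added in this PR:

```python
from unstructured.partition.xlsx import partition_xlsx

# Partition the example workbook; each sheet may now yield several Table
# elements plus the surrounding Title/Text elements.
elements = partition_xlsx(filename="example-docs/vodafone.xlsx")
for element in elements:
    print(type(element).__name__, "-", element.text[:60])
```
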
This commit is contained in:
Klaijan 2023-10-04 13:30:23 -04:00 committed by GitHub
parent 19d8bff275
commit 0a65fc2134
39 changed files with 1033 additions and 301 deletions


@ -2,6 +2,7 @@
### Enhancements
* **Adds XLSX document level language detection** Building on the language detection functionality in the previous release, we now support language detection within the `.xlsx` file type at the element level.
* **bump `unstructured-inference` to `0.6.6`** The updated version of `unstructured-inference` makes table extraction in `hi_res` mode configurable to fine tune table extraction performance; it also improves element detection by adding a deduplication post processing step in the `hi_res` partitioning of pdfs and images.
* **Detect text in HTML Heading Tags as Titles** This will increase the accuracy of hierarchies in HTML documents and provide more accurate element categorization. If text is in an HTML heading tag and is not a list item, address, or narrative text, categorize it as a title.
* **Update python-based docs** Refactor docs to use the actual unstructured code rather than using the subprocess library to run the cli command itself.
@ -10,7 +11,7 @@
* **Expose endpoint url for s3 connectors** By allowing for the endpoint url to be explicitly overwritten, this allows for any non-AWS data providers supporting the s3 protocol to be supported (i.e. minio).
* **change default `hi_res` model for pdf/image partition to `yolox`** Now partitioning pdf/image using the `hi_res` strategy utilizes the `yolox_quantized` model instead of the `detectron2_onnx` model. This new default model has better recall for tables and produces more detailed categories for elements.
### Features
* **XLSX can now read subtables within one sheet** Problem: Many .xlsx files are not created to be read as one full table per sheet; there are subtables, text, and headers, along with more information to extract from each sheet. Feature: `partition_xlsx` can now read subtable(s) within one .xlsx sheet, along with extracting other title and narrative text. Importance: This extends .xlsx reading beyond one table per sheet, allowing users to capture additional data tables from a file, if they exist.
### Fixes


@ -18,7 +18,7 @@ certifi==2023.7.22
# -c requirements/constraints.in
# -r requirements/build.in
# requests
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests
@ -54,7 +54,7 @@ mdurl==0.1.2
# via markdown-it-py
myst-parser==2.0.0
# via -r requirements/build.in
packaging==23.1
packaging==23.2
# via
# -c requirements/base.txt
# sphinx

Binary file not shown.

BIN example-docs/vodafone.xlsx (new file; binary content not shown)


@ -12,7 +12,7 @@ certifi==2023.7.22
# requests
chardet==5.2.0
# via -r requirements/base.in
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via requests
click==8.1.7
# via nltk
@ -40,7 +40,7 @@ numpy==1.24.4
# via
# -c requirements/constraints.in
# -r requirements/base.in
packaging==23.1
packaging==23.2
# via marshmallow
python-iso639==2023.6.15
# via -r requirements/base.in


@ -18,7 +18,7 @@ certifi==2023.7.22
# -c requirements/constraints.in
# -r requirements/build.in
# requests
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests
@ -54,7 +54,7 @@ mdurl==0.1.2
# via markdown-it-py
myst-parser==2.0.0
# via -r requirements/build.in
packaging==23.1
packaging==23.2
# via
# -c requirements/base.txt
# sphinx


@ -13,21 +13,18 @@ appnope==0.1.3
# ipykernel
# ipython
argon2-cffi==23.1.0
# via jupyter-server
# via
# jupyter-server
# nbclassic
# notebook
argon2-cffi-bindings==21.2.0
# via argon2-cffi
arrow==1.2.3
# via isoduration
asttokens==2.4.0
# via stack-data
async-lru==2.0.4
# via jupyterlab
attrs==23.1.0
# via
# jsonschema
# referencing
babel==2.12.1
# via jupyterlab-server
backcall==0.2.0
# via ipython
beautifulsoup4==4.12.2
@ -36,23 +33,10 @@ beautifulsoup4==4.12.2
# nbconvert
bleach==6.0.0
# via nbconvert
build==1.0.3
# via pip-tools
certifi==2023.7.22
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# -c requirements/test.txt
# requests
cffi==1.16.0
# via argon2-cffi-bindings
cfgv==3.4.0
# via pre-commit
charset-normalizer==3.2.0
# via
# -c requirements/base.txt
# -c requirements/test.txt
# requests
click==8.1.7
# via
# -c requirements/base.txt
@ -70,47 +54,40 @@ defusedxml==0.7.1
# via nbconvert
distlib==0.3.7
# via virtualenv
entrypoints==0.4
# via
# jupyter-client
# nbconvert
exceptiongroup==1.1.3
# via
# -c requirements/test.txt
# anyio
executing==1.2.0
executing==2.0.0
# via stack-data
fastjsonschema==2.18.0
fastjsonschema==2.18.1
# via nbformat
filelock==3.12.4
# via virtualenv
fqdn==1.5.1
# via jsonschema
identify==2.5.29
identify==2.5.30
# via pre-commit
idna==3.4
# via
# -c requirements/base.txt
# -c requirements/test.txt
# anyio
# jsonschema
# requests
importlib-metadata==6.8.0
# via
# build
# jupyter-client
# jupyter-lsp
# jupyterlab
# jupyterlab-server
# nbconvert
importlib-resources==6.1.0
# via
# jsonschema
# jsonschema-specifications
# jupyterlab
ipykernel==6.25.2
# notebook
ipykernel==6.11.0
# via
# jupyter
# jupyter-console
# jupyterlab
# nbclassic
# notebook
# qtconsole
ipython==8.12.2
ipython==8.12.3
# via
# -c requirements/constraints.in
# -r requirements/dev.in
@ -118,74 +95,55 @@ ipython==8.12.2
# ipywidgets
# jupyter-console
ipython-genutils==0.2.0
# via qtconsole
# via
# jupyter-server
# nbclassic
# notebook
# qtconsole
ipywidgets==8.1.1
# via jupyter
isoduration==20.11.0
# via jsonschema
jedi==0.19.0
# via ipython
jinja2==3.1.2
# via
# jupyter-server
# jupyterlab
# jupyterlab-server
# nbclassic
# nbconvert
json5==0.9.14
# via jupyterlab-server
jsonpointer==2.4
# via jsonschema
jsonschema[format-nongpl]==4.19.1
# via
# jupyter-events
# jupyterlab-server
# nbformat
# notebook
jsonschema==4.19.1
# via nbformat
jsonschema-specifications==2023.7.1
# via jsonschema
jupyter==1.0.0
# via -r requirements/dev.in
jupyter-client==8.3.1
jupyter-client==7.4.9
# via
# ipykernel
# jupyter-console
# jupyter-server
# nbclassic
# nbclient
# notebook
# qtconsole
jupyter-console==6.6.3
jupyter-console==6.4.4
# via jupyter
jupyter-core==5.3.2
# via
# -c requirements/constraints.in
# ipykernel
# jupyter-client
# jupyter-console
# jupyter-server
# jupyterlab
# nbclient
# nbclassic
# nbconvert
# nbformat
# qtconsole
jupyter-events==0.7.0
# via jupyter-server
jupyter-lsp==2.2.0
# via jupyterlab
jupyter-server==2.7.3
# via
# jupyter-lsp
# jupyterlab
# jupyterlab-server
# notebook
# qtconsole
jupyter-server==1.13.1
# via
# nbclassic
# notebook-shim
jupyter-server-terminals==0.4.4
# via jupyter-server
jupyterlab==4.0.6
# via notebook
jupyterlab-pygments==0.2.2
# via nbconvert
jupyterlab-server==2.25.0
# via
# jupyterlab
# notebook
jupyterlab-widgets==3.0.9
# via ipywidgets
markupsafe==2.1.3
@ -196,52 +154,53 @@ matplotlib-inline==0.1.6
# via
# ipykernel
# ipython
mistune==3.0.1
mistune==0.8.4
# via nbconvert
nbclient==0.8.0
nbclassic==1.0.0
# via notebook
nbclient==0.5.13
# via nbconvert
nbconvert==7.8.0
nbconvert==6.4.5
# via
# jupyter
# jupyter-server
# nbclassic
# notebook
nbformat==5.9.2
# via
# jupyter-server
# nbclassic
# nbclient
# nbconvert
# notebook
nest-asyncio==1.5.8
# via ipykernel
# via
# ipykernel
# jupyter-client
# nbclassic
# nbclient
# notebook
nodeenv==1.8.0
# via pre-commit
notebook==7.0.4
notebook==6.5.6
# via jupyter
notebook-shim==0.2.3
# via
# jupyterlab
# nbclassic
# notebook
overrides==7.4.0
# via jupyter-server
packaging==23.1
# via
# -c requirements/base.txt
# -c requirements/test.txt
# build
# ipykernel
# jupyter-server
# jupyterlab
# jupyterlab-server
# nbconvert
# qtconsole
# qtpy
pandocfilters==1.5.0
# via nbconvert
parso==0.8.3
# via jedi
pep517==0.13.0
# via
# build
# pip-tools
pexpect==4.8.0
# via ipython
pickleshare==0.7.5
# via ipython
pip-tools==7.3.0
pip-tools==6.6.2
# via -r requirements/dev.in
pkgutil-resolve-name==1.3.10
# via jsonschema
@ -253,7 +212,10 @@ platformdirs==3.10.0
pre-commit==3.4.0
# via -r requirements/dev.in
prometheus-client==0.17.1
# via jupyter-server
# via
# jupyter-server
# nbclassic
# notebook
prompt-toolkit==3.0.39
# via
# ipython
@ -274,57 +236,38 @@ pygments==2.16.1
# jupyter-console
# nbconvert
# qtconsole
pyproject-hooks==1.0.0
# via build
python-dateutil==2.8.2
# via
# -c requirements/test.txt
# arrow
# jupyter-client
python-json-logger==2.0.7
# via jupyter-events
pytz==2023.3.post1
# via babel
pyyaml==6.0.1
# via
# -c requirements/test.txt
# jupyter-events
# pre-commit
pyzmq==25.1.1
pyzmq==24.0.1
# via
# ipykernel
# jupyter-client
# jupyter-console
# jupyter-server
# nbclassic
# notebook
# qtconsole
qtconsole==5.4.4
qtconsole==5.2.2
# via jupyter
qtpy==2.4.0
qtpy==1.11.3
# via qtconsole
referencing==0.30.2
# via
# jsonschema
# jsonschema-specifications
# jupyter-events
requests==2.31.0
# via
# -c requirements/base.txt
# -c requirements/test.txt
# jupyterlab-server
rfc3339-validator==0.1.4
# via
# jsonschema
# jupyter-events
rfc3986-validator==0.1.1
# via
# jsonschema
# jupyter-events
rpds-py==0.10.3
# via
# jsonschema
# referencing
send2trash==1.8.2
# via jupyter-server
# via
# jupyter-server
# nbclassic
# notebook
six==1.16.0
# via
# -c requirements/base.txt
@ -332,34 +275,32 @@ six==1.16.0
# asttokens
# bleach
# python-dateutil
# rfc3339-validator
sniffio==1.3.0
# via anyio
soupsieve==2.5
# via
# -c requirements/base.txt
# beautifulsoup4
stack-data==0.6.2
stack-data==0.6.3
# via ipython
terminado==0.17.1
# via
# jupyter-server
# jupyter-server-terminals
tinycss2==1.2.1
# nbclassic
# notebook
testpath==0.6.0
# via nbconvert
tomli==2.0.1
# via
# -c requirements/test.txt
# build
# jupyterlab
# pep517
# pip-tools
# pyproject-hooks
tornado==6.3.3
# via
# ipykernel
# jupyter-client
# jupyter-server
# jupyterlab
# nbclassic
# notebook
# terminado
traitlets==5.10.1
@ -369,40 +310,26 @@ traitlets==5.10.1
# ipython
# ipywidgets
# jupyter-client
# jupyter-console
# jupyter-core
# jupyter-events
# jupyter-server
# jupyterlab
# matplotlib-inline
# nbclassic
# nbclient
# nbconvert
# nbformat
# notebook
# qtconsole
typing-extensions==4.8.0
# via
# -c requirements/base.txt
# -c requirements/test.txt
# async-lru
# ipython
uri-template==1.3.0
# via jsonschema
urllib3==1.26.16
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# -c requirements/test.txt
# requests
virtualenv==20.24.5
# via pre-commit
wcwidth==0.2.7
wcwidth==0.2.8
# via prompt-toolkit
webcolors==1.13
# via jsonschema
webencodings==0.5.1
# via
# bleach
# tinycss2
# via bleach
websocket-client==1.6.3
# via jupyter-server
wheel==0.41.2
@ -412,9 +339,7 @@ wheel==0.41.2
widgetsnbextension==4.0.9
# via ipywidgets
zipp==3.17.0
# via
# importlib-metadata
# importlib-resources
# via importlib-resources
# The following packages are considered to be unsafe in a requirements file:
# pip


@ -19,7 +19,7 @@ certifi==2023.7.22
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests
@ -39,13 +39,13 @@ cython==3.0.2
# via unstructured-paddleocr
et-xmlfile==1.1.0
# via openpyxl
flask==2.3.3
flask==3.0.0
# via
# flask-babel
# visualdl
flask-babel==3.1.0
# via visualdl
fonttools==4.42.1
fonttools==4.43.0
# via matplotlib
future==0.18.3
# via bce-python-sdk
@ -122,7 +122,7 @@ opencv-python==4.8.0.76
# unstructured-paddleocr
openpyxl==3.1.2
# via unstructured-paddleocr
packaging==23.1
packaging==23.2
# via
# -c requirements/base.txt
# matplotlib
@ -218,7 +218,7 @@ urllib3==1.26.16
# requests
visualdl==2.5.3
# via unstructured-paddleocr
werkzeug==2.3.7
werkzeug==3.0.0
# via flask
zipp==3.17.0
# via


@ -13,7 +13,7 @@ certifi==2023.7.22
# requests
cffi==1.16.0
# via cryptography
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# pdfminer-six
@ -35,7 +35,7 @@ filelock==3.12.4
# transformers
flatbuffers==23.5.26
# via onnxruntime
fonttools==4.42.1
fonttools==4.43.0
# via matplotlib
fsspec==2023.9.1
# via
@ -98,7 +98,7 @@ opencv-python==4.8.0.76
# -c requirements/constraints.in
# layoutparser
# unstructured-inference
packaging==23.1
packaging==23.2
# via
# -c requirements/base.txt
# huggingface-hub


@ -10,5 +10,5 @@ pillow==10.0.1
# via python-pptx
python-pptx==0.6.21
# via -r requirements/extra-pptx.in
xlsxwriter==3.1.5
xlsxwriter==3.1.6
# via python-pptx


@ -9,7 +9,7 @@ certifi==2023.7.22
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests
@ -53,7 +53,7 @@ numpy==1.24.4
# -c requirements/base.txt
# -c requirements/constraints.in
# transformers
packaging==23.1
packaging==23.2
# via
# -c requirements/base.txt
# huggingface-hub


@ -9,7 +9,7 @@ certifi==2023.7.22
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests


@ -18,7 +18,7 @@ certifi==2023.7.22
# -c requirements/constraints.in
# msrest
# requests
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests


@ -34,7 +34,7 @@ cffi==1.16.0
# via
# azure-datalake-store
# cryptography
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# aiohttp
@ -61,7 +61,7 @@ idna==3.4
# yarl
isodate==0.6.1
# via azure-storage-blob
msal==1.24.0
msal==1.24.1
# via
# azure-datalake-store
# azure-identity


@ -17,7 +17,7 @@ certifi==2023.7.22
# requests
cffi==1.16.0
# via cryptography
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests


@ -11,7 +11,7 @@ certifi==2023.7.22
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests


@ -12,7 +12,7 @@ async-timeout==4.0.3
# via aiohttp
attrs==23.1.0
# via aiohttp
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# aiohttp


@ -9,7 +9,7 @@ certifi==2023.7.22
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests


@ -25,7 +25,7 @@ certifi==2023.7.22
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# aiohttp


@ -13,7 +13,7 @@ cffi==1.16.0
# via
# cryptography
# pynacl
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests
@ -27,20 +27,31 @@ idna==3.4
# requests
pycparser==2.21
# via cffi
pygithub==1.59.1
pygithub==2.1.1
# via -r requirements/ingest-github.in
pyjwt[crypto]==2.8.0
# via pygithub
pynacl==1.5.0
# via pygithub
python-dateutil==2.8.2
# via pygithub
requests==2.31.0
# via
# -c requirements/base.txt
# pygithub
six==1.16.0
# via
# -c requirements/base.txt
# python-dateutil
typing-extensions==4.8.0
# via
# -c requirements/base.txt
# pygithub
urllib3==1.26.16
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# pygithub
# requests
wrapt==1.15.0
# via deprecated


@ -9,7 +9,7 @@ certifi==2023.7.22
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests


@ -11,7 +11,7 @@ certifi==2023.7.22
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests


@ -11,7 +11,7 @@ certifi==2023.7.22
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests


@ -17,7 +17,7 @@ certifi==2023.7.22
# requests
cffi==1.16.0
# via cryptography
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests
@ -29,7 +29,7 @@ idna==3.4
# via
# -c requirements/base.txt
# requests
msal==1.24.0
msal==1.24.1
# via
# -r requirements/ingest-onedrive.in
# office365-rest-python-client


@ -25,7 +25,7 @@ certifi==2023.7.22
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# aiohttp
@ -50,7 +50,7 @@ jsonpatch==1.33
# via langchain
jsonpointer==2.4
# via jsonpatch
langchain==0.0.304
langchain==0.0.305
# via -r requirements/ingest-openai.in
langsmith==0.0.41
# via langchain
@ -76,7 +76,7 @@ numpy==1.24.4
# numexpr
openai==0.28.1
# via -r requirements/ingest-openai.in
packaging==23.1
packaging==23.2
# via
# -c requirements/base.txt
# marshmallow


@ -11,7 +11,7 @@ certifi==2023.7.22
# requests
cffi==1.16.0
# via cryptography
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests
@ -23,7 +23,7 @@ idna==3.4
# via
# -c requirements/base.txt
# requests
msal==1.24.0
msal==1.24.1
# via
# -r requirements/ingest-outlook.in
# office365-rest-python-client


@ -9,7 +9,7 @@ certifi==2023.7.22
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests


@ -20,7 +20,7 @@ attrs==23.1.0
# via aiohttp
botocore==1.31.17
# via aiobotocore
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# aiohttp


@ -13,7 +13,7 @@ certifi==2023.7.22
# requests
cffi==1.16.0
# via cryptography
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests


@ -11,7 +11,7 @@ certifi==2023.7.22
# requests
cffi==1.16.0
# via cryptography
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests
@ -23,7 +23,7 @@ idna==3.4
# via
# -c requirements/base.txt
# requests
msal==1.24.0
msal==1.24.1
# via
# -r requirements/ingest-sharepoint.in
# office365-rest-python-client


@ -13,7 +13,7 @@ certifi==2023.7.22
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests


@ -13,7 +13,7 @@ certifi==2023.7.22
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
charset-normalizer==3.2.0
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# requests
@ -32,7 +32,7 @@ flake8==6.1.0
# via -r requirements/test.in
freezegun==1.2.2
# via -r requirements/test.in
grpcio==1.58.0
grpcio==1.59.0
# via -r requirements/test.in
idna==3.4
# via
@ -61,7 +61,7 @@ mypy-extensions==1.0.0
# -c requirements/base.txt
# black
# mypy
packaging==23.1
packaging==23.2
# via
# -c requirements/base.txt
# black


@ -8,7 +8,7 @@ from unittest.mock import patch
import docx
import pytest
from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT, EXPECTED_TITLE
from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import (
@ -708,36 +708,51 @@ EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsh
def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
elements = partition(filename=filename, include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
assert sum(isinstance(element, Table) for element in elements) == 2
assert sum(isinstance(element, Title) for element in elements) == 2
assert len(elements) == 4
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
assert elements[0].metadata.page_number == 1
assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT
assert elements[1].metadata.text_as_html == EXPECTED_TABLE
assert elements[1].metadata.page_number == 1
assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE
def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
with open(filename, "rb") as f:
elements = partition(file=f, include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
assert sum(isinstance(element, Table) for element in elements) == 2
assert sum(isinstance(element, Title) for element in elements) == 2
assert len(elements) == 4
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
assert elements[0].metadata.page_number == 1
assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT
assert elements[1].metadata.text_as_html == EXPECTED_TABLE
assert elements[1].metadata.page_number == 1
assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE
EXPECTED_XLS_TEXT_LEN = 507
EXPECTED_XLS_TEXT_LEN = 550
EXPECTED_XLS_INITIAL_45_CLEAN_TEXT = "MA What C datatypes are 8 bits? (assume i386)"
EXPECTED_XLS_INITIAL_45_CLEAN_TEXT = "MC What is 2+2? 4 correct 3 incorrect MA What"
EXPECTED_XLS_TABLE = (
"""<table border="1" class="dataframe">
<tbody>
<tr>
<td>MC</td>
<td>What is 2+2?</td>
<td>4</td>
<td>correct</td>
<td>3</td>
<td>incorrect</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>MA</td>
<td>What C datatypes are 8 bits? (assume i386)</td>
@ -814,8 +829,8 @@ EXPECTED_XLS_TABLE = (
def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"):
elements = partition(filename=filename, include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 3
assert sum(isinstance(element, Table) for element in elements) == 2
assert len(elements) == 18
assert clean_extra_whitespace(elements[0].text)[:45] == EXPECTED_XLS_INITIAL_45_CLEAN_TEXT
# NOTE(crag): if the beautifulsoup4 package is installed, some (but not all) additional


@ -23,6 +23,7 @@ EXPECTED_TABLE = """<table border="1" class="dataframe">
</tbody>
</table>"""
EXPECTED_TITLE = "Stanley Cups"
EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"


@ -1,6 +1,8 @@
from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
import pytest
from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT, EXPECTED_TITLE
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Table
from unstructured.documents.elements import Table, Text, Title
from unstructured.partition.json import partition_json
from unstructured.partition.xlsx import partition_xlsx
from unstructured.staging.base import elements_to_json
@ -13,20 +15,21 @@ EXCEPTED_PAGE_NAME = "Stanley Cups"
def test_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
elements = partition_xlsx(filename=filename, include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
assert sum(isinstance(element, Table) for element in elements) == 2
assert len(elements) == 4
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
assert elements[0].metadata.page_number == 1
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
assert elements[0].metadata.page_name == EXCEPTED_PAGE_NAME
assert elements[0].metadata.filename == "stanley-cups.xlsx"
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT
assert elements[1].metadata.text_as_html == EXPECTED_TABLE
assert elements[1].metadata.page_number == 1
assert elements[1].metadata.filetype == EXPECTED_FILETYPE
assert elements[1].metadata.page_name == EXCEPTED_PAGE_NAME
assert elements[1].metadata.filename == "stanley-cups.xlsx"
def test_partition_xlsx_from_filename_with_emoji(filename="example-docs/emoji.xlsx"):
elements = partition_xlsx(filename=filename, include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert sum(isinstance(element, Text) for element in elements) == 1
assert len(elements) == 1
assert clean_extra_whitespace(elements[0].text) == "🤠😅"
@ -36,14 +39,16 @@ def test_partition_xlsx_from_filename_with_metadata_filename(
):
elements = partition_xlsx(filename=filename, metadata_filename="test", include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert sum(isinstance(element, Table) for element in elements) == 2
assert sum(isinstance(element, Title) for element in elements) == 2
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT
assert elements[0].metadata.filename == "test"
def test_partition_xlsx_from_filename_with_header(filename="example-docs/stanley-cups.xlsx"):
elements = partition_xlsx(filename=filename, include_header=True)
assert all(isinstance(element, Table) for element in elements)
assert sum(isinstance(element, Table) for element in elements) == 2
assert len(elements) == 2
assert (
clean_extra_whitespace(elements[0].text)
@ -56,30 +61,31 @@ def test_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
with open(filename, "rb") as f:
elements = partition_xlsx(file=f, include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
assert elements[0].metadata.page_number == 1
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
assert elements[0].metadata.page_name == EXCEPTED_PAGE_NAME
assert elements[0].metadata.filename is None
assert sum(isinstance(element, Table) for element in elements) == 2
assert len(elements) == 4
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT
assert elements[1].metadata.text_as_html == EXPECTED_TABLE
assert elements[1].metadata.page_number == 1
assert elements[1].metadata.filetype == EXPECTED_FILETYPE
assert elements[1].metadata.page_name == EXCEPTED_PAGE_NAME
assert elements[1].metadata.filename is None
def test_partition_xlsx_from_file_with_metadata_filename(filename="example-docs/stanley-cups.xlsx"):
with open(filename, "rb") as f:
elements = partition_xlsx(file=f, metadata_filename="test", include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert elements[0].metadata.filename == "test"
assert sum(isinstance(element, Table) for element in elements) == 2
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT
assert elements[1].metadata.filename == "test"
def test_partition_xlsx_from_file_with_header(filename="example-docs/stanley-cups.xlsx"):
with open(filename, "rb") as f:
elements = partition_xlsx(file=f, include_header=True)
assert all(isinstance(element, Table) for element in elements)
assert sum(isinstance(element, Table) for element in elements) == 2
assert len(elements) == 2
assert (
clean_extra_whitespace(elements[0].text)
@ -91,25 +97,27 @@ def test_partition_xlsx_from_file_with_header(filename="example-docs/stanley-cup
def test_partition_xlsx_filename_exclude_metadata(filename="example-docs/stanley-cups.xlsx"):
elements = partition_xlsx(filename=filename, include_metadata=False, include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
assert sum(isinstance(element, Table) for element in elements) == 2
assert len(elements) == 4
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert elements[0].metadata.text_as_html is None
assert elements[0].metadata.page_number is None
assert elements[0].metadata.filetype is None
assert elements[0].metadata.page_name is None
assert elements[0].metadata.filename is None
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT
assert elements[1].metadata.text_as_html is None
assert elements[1].metadata.page_number is None
assert elements[1].metadata.filetype is None
assert elements[1].metadata.page_name is None
assert elements[1].metadata.filename is None
def test_partition_xlsx_from_file_exclude_metadata(filename="example-docs/stanley-cups.xlsx"):
with open(filename, "rb") as f:
elements = partition_xlsx(file=f, include_metadata=False, include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
assert sum(isinstance(element, Table) for element in elements) == 2
assert sum(isinstance(element, Title) for element in elements) == 2
assert len(elements) == 4
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT
assert elements[0].metadata.text_as_html is None
assert elements[0].metadata.page_number is None
assert elements[0].metadata.filetype is None
@ -205,3 +213,19 @@ def test_partition_xlsx_with_json(filename="example-docs/stanley-cups.xlsx"):
for i in range(len(elements)):
assert elements[i] == test_elements[i]
@pytest.mark.skip("Needs to fix language detection for table. Currently detected as 'tur'")
def test_partition_xlsx_metadata_language_from_filename(filename="example-docs/stanley-cups.xlsx"):
elements = partition_xlsx(filename=filename, include_header=False)
assert sum(isinstance(element, Table) for element in elements) == 2
assert len(elements) == 4
assert elements[0].metadata.languages == ["eng"]
def test_partition_xlsx_subtables(filename="example-docs/vodafone.xlsx"):
elements = partition_xlsx(filename)
assert sum(isinstance(element, Table) for element in elements) == 3
assert len(elements) == 6


@ -1,7 +1,7 @@
[
{
"type": "Table",
"element_id": "3e65b02bec20bb1056bd23a3b4ecd0f6",
"type": "Title",
"element_id": "c37e2cb941a2e20a9f728fbea5f9e400",
"metadata": {
"data_source": {
"url": "https://unstructuredio.sharepoint.com/Shared Documents/stanley-cups.xlsx?d=wb9956a338079432191ea609def07394d",
@ -15,15 +15,18 @@
},
"filename": "stanley-cups.xlsx",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"languages": [
"tur"
],
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Stanley Cups</th>\n <th>Unnamed: 1</th>\n <th>Unnamed: 2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nStanley Cups\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
"text": "Stanley Cups"
},
{
"type": "Table",
"element_id": "0699dddf33814117e04654068f5182f6",
"element_id": "c00fc0e5ac303c40f9089791e5e485b1",
"metadata": {
"data_source": {
"url": "https://unstructuredio.sharepoint.com/Shared Documents/stanley-cups.xlsx?d=wb9956a338079432191ea609def07394d",
@ -37,10 +40,63 @@
},
"filename": "stanley-cups.xlsx",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"languages": [
"tur"
],
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
},
{
"type": "Title",
"element_id": "98656277bdadc9ef7d1a9e1bc969579b",
"metadata": {
"data_source": {
"url": "https://unstructuredio.sharepoint.com/Shared Documents/stanley-cups.xlsx?d=wb9956a338079432191ea609def07394d",
"version": 1,
"record_locator": {
"server_path": "/Shared Documents/stanley-cups.xlsx",
"site_url": "https://unstructuredio.sharepoint.com"
},
"date_created": "2023-06-16T05:05:05",
"date_modified": "2023-06-16T05:05:05"
},
"filename": "stanley-cups.xlsx",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"languages": [
"tur"
],
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Stanley Cups Since 67</th>\n <th>Unnamed: 1</th>\n <th>Unnamed: 2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nStanley Cups Since 67\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
"text": "Stanley Cups Since 67"
},
{
"type": "Table",
"element_id": "31421b5cd94fedb10dc82738503b4505",
"metadata": {
"data_source": {
"url": "https://unstructuredio.sharepoint.com/Shared Documents/stanley-cups.xlsx?d=wb9956a338079432191ea609def07394d",
"version": 1,
"record_locator": {
"server_path": "/Shared Documents/stanley-cups.xlsx",
"site_url": "https://unstructuredio.sharepoint.com"
},
"date_created": "2023-06-16T05:05:05",
"date_modified": "2023-06-16T05:05:05"
},
"filename": "stanley-cups.xlsx",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"languages": [
"tur"
],
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
}
]


@ -1,7 +1,7 @@
[
{
"type": "Table",
"element_id": "3e65b02bec20bb1056bd23a3b4ecd0f6",
"type": "Title",
"element_id": "c37e2cb941a2e20a9f728fbea5f9e400",
"metadata": {
"data_source": {
"url": "gs://utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx",
@ -14,15 +14,18 @@
"date_modified": "2023-06-20T23:48:24.973000+00:00"
},
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"languages": [
"tur"
],
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Stanley Cups</th>\n <th>Unnamed: 1</th>\n <th>Unnamed: 2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nStanley Cups\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
"text": "Stanley Cups"
},
{
"type": "Table",
"element_id": "0699dddf33814117e04654068f5182f6",
"element_id": "c00fc0e5ac303c40f9089791e5e485b1",
"metadata": {
"data_source": {
"url": "gs://utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx",
@ -35,10 +38,61 @@
"date_modified": "2023-06-20T23:48:24.973000+00:00"
},
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"languages": [
"tur"
],
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
},
{
"type": "Title",
"element_id": "98656277bdadc9ef7d1a9e1bc969579b",
"metadata": {
"data_source": {
"url": "gs://utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx",
"version": "COul9MuE0/8CEAE=",
"record_locator": {
"protocol": "gs",
"remote_file_path": "utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx"
},
"date_created": "2023-06-20T23:48:24.973000+00:00",
"date_modified": "2023-06-20T23:48:24.973000+00:00"
},
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"languages": [
"tur"
],
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Stanley Cups Since 67</th>\n <th>Unnamed: 1</th>\n <th>Unnamed: 2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nStanley Cups Since 67\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
"text": "Stanley Cups Since 67"
},
{
"type": "Table",
"element_id": "31421b5cd94fedb10dc82738503b4505",
"metadata": {
"data_source": {
"url": "gs://utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx",
"version": "COul9MuE0/8CEAE=",
"record_locator": {
"protocol": "gs",
"remote_file_path": "utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx"
},
"date_created": "2023-06-20T23:48:24.973000+00:00",
"date_modified": "2023-06-20T23:48:24.973000+00:00"
},
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"languages": [
"tur"
],
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
}
]


@ -1,7 +1,7 @@
[
{
"type": "Table",
"element_id": "0e2d044a26942328e2b8647574232e7f",
"element_id": "a5c9668a6055bca2865ea5e6d16ea1e0",
"metadata": {
"data_source": {
"url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls",
@ -14,15 +14,18 @@
},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"languages": [
"eng"
],
"page_number": 1,
"page_name": "Example Test",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>MC</th>\n <th>What is 2+2?</th>\n <th>4</th>\n <th>correct</th>\n <th>3</th>\n <th>incorrect</th>\n <th>Unnamed: 6</th>\n <th>Unnamed: 7</th>\n <th>Unnamed: 8</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>MA</td>\n <td>What C datatypes are 8 bits? (assume i386)</td>\n <td>int</td>\n <td></td>\n <td>float</td>\n <td></td>\n <td>double</td>\n <td></td>\n <td>char</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>Bagpipes are awesome.</td>\n <td>true</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Rank the following in their order of operation.</td>\n <td>Parentheses</td>\n <td>Exponents</td>\n <td>Division</td>\n <td>Addition</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>The student activities fee is</td>\n <td>95</td>\n <td>dollars for students enrolled in</td>\n <td>19</td>\n <td>units or more,</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Match the lower-case greek letter with its capital form.</td>\n <td>λ</td>\n <td>Λ</td>\n <td>α</td>\n <td>γ</td>\n <td>Γ</td>\n <td>φ</td>\n <td>Φ</td>\n </tr>\n </tbody>\n</table>"
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>MC</td>\n <td>What is 2+2?</td>\n <td>4</td>\n <td>correct</td>\n <td>3</td>\n <td>incorrect</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>MA</td>\n <td>What C datatypes are 8 bits? (assume i386)</td>\n <td>int</td>\n <td></td>\n <td>float</td>\n <td></td>\n <td>double</td>\n <td></td>\n <td>char</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>Bagpipes are awesome.</td>\n <td>true</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Rank the following in their order of operation.</td>\n <td>Parentheses</td>\n <td>Exponents</td>\n <td>Division</td>\n <td>Addition</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>The student activities fee is</td>\n <td>95</td>\n <td>dollars for students enrolled in</td>\n <td>19</td>\n <td>units or more,</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Match the lower-case greek letter with its capital form.</td>\n <td>λ</td>\n <td>Λ</td>\n <td>α</td>\n <td>γ</td>\n <td>Γ</td>\n <td>φ</td>\n <td>Φ</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nMC\nWhat is 2+2?\n4\ncorrect\n3\nincorrect\nUnnamed: 6\nUnnamed: 7\nUnnamed: 8\n\n\n\n\nMA\nWhat C datatypes are 8 bits? (assume i386)\nint\n\nfloat\n\ndouble\n\nchar\n\n\nTF\nBagpipes are awesome.\ntrue\n\n\n\n\n\n\n\n\nESS\nHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n\n\n\n\n\n\n\n\n\nORD\nRank the following in their order of operation.\nParentheses\nExponents\nDivision\nAddition\n\n\n\n\n\nFIB\nThe student activities fee is\n95\ndollars for students enrolled in\n19\nunits or more,\n\n\n\n\n\nMAT\nMatch the lower-case greek letter with its capital form.\nλ\nΛ\nα\nγ\nΓ\nφ\nΦ\n\n\n"
"text": "\n\n\nMC\nWhat is 2+2?\n4\ncorrect\n3\nincorrect\n\n\n\n\n\nMA\nWhat C datatypes are 8 bits? (assume i386)\nint\n\nfloat\n\ndouble\n\nchar\n\n\nTF\nBagpipes are awesome.\ntrue\n\n\n\n\n\n\n\n\nESS\nHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n\n\n\n\n\n\n\n\n\nORD\nRank the following in their order of operation.\nParentheses\nExponents\nDivision\nAddition\n\n\n\n\n\nFIB\nThe student activities fee is\n95\ndollars for students enrolled in\n19\nunits or more,\n\n\n\n\n\nMAT\nMatch the lower-case greek letter with its capital form.\nλ\nΛ\nα\nγ\nΓ\nφ\nΦ\n\n\n"
},
{
"type": "Table",
"element_id": "5c56dd4c5b649b873ebd848312e66753",
"type": "Title",
"element_id": "1d34c23ff08573afa07b42842b41277a",
"metadata": {
"data_source": {
"url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls",
@@ -35,15 +38,64 @@
},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"languages": [
"eng"
],
"page_number": 2,
"page_name": "Format Abbr."
},
"text": "http://www.cmu.edu/blackboard"
},
{
"type": "Title",
"element_id": "05440c6ca94cb55f6d185d8bd92ce9d6",
"metadata": {
"data_source": {
"url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls",
"record_locator": {
"user_pname": "devops@unstructuredio.onmicrosoft.com",
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls"
},
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43"
},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"languages": [
"eng"
],
"page_number": 2,
"page_name": "Format Abbr."
},
"text": "Question Format Abbreviations"
},
{
"type": "Table",
"element_id": "e39c724f1b09a4c3286b6368538e05fc",
"metadata": {
"data_source": {
"url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls",
"record_locator": {
"user_pname": "devops@unstructuredio.onmicrosoft.com",
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls"
},
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43"
},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"languages": [
"eng"
],
"page_number": 2,
"page_name": "Format Abbr.",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Unnamed: 0</th>\n <th>Unnamed: 1</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>http://www.cmu.edu/blackboard</td>\n <td></td>\n </tr>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>Question Format Abbreviations</td>\n <td></td>\n </tr>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>Abbreviation</td>\n <td>Question Type</td>\n </tr>\n <tr>\n <td>MC</td>\n <td>Multiple Choice</td>\n </tr>\n <tr>\n <td>MA</td>\n <td>Multiple Answer</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>True/False</td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>Essay</td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Ordering</td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Matching</td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>Fill in the Blank</td>\n </tr>\n <tr>\n <td>FIL</td>\n <td>File response</td>\n </tr>\n <tr>\n <td>NUM</td>\n <td>Numeric Response</td>\n </tr>\n <tr>\n <td>SR</td>\n <td>Short response</td>\n </tr>\n <tr>\n <td>OP</td>\n <td>Opinion</td>\n </tr>\n <tr>\n <td>FIB_PLUS</td>\n <td>Multiple Fill in the Blank</td>\n </tr>\n <tr>\n <td>JUMBLED_SENTENCE</td>\n <td>Jumbled Sentence</td>\n </tr>\n <tr>\n <td>QUIZ_BOWL</td>\n <td>Quiz Bowl</td>\n </tr>\n </tbody>\n</table>"
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Abbreviation</td>\n <td>Question Type</td>\n </tr>\n <tr>\n <td>MC</td>\n <td>Multiple Choice</td>\n </tr>\n <tr>\n <td>MA</td>\n <td>Multiple Answer</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>True/False</td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>Essay</td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Ordering</td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Matching</td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>Fill in the Blank</td>\n </tr>\n <tr>\n <td>FIL</td>\n <td>File response</td>\n </tr>\n <tr>\n <td>NUM</td>\n <td>Numeric Response</td>\n </tr>\n <tr>\n <td>SR</td>\n <td>Short response</td>\n </tr>\n <tr>\n <td>OP</td>\n <td>Opinion</td>\n </tr>\n <tr>\n <td>FIB_PLUS</td>\n <td>Multiple Fill in the Blank</td>\n </tr>\n <tr>\n <td>JUMBLED_SENTENCE</td>\n <td>Jumbled Sentence</td>\n </tr>\n <tr>\n <td>QUIZ_BOWL</td>\n <td>Quiz Bowl</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nUnnamed: 0\nUnnamed: 1\n\n\n\n\n\n\n\n\n\n\n\n\nhttp://www.cmu.edu/blackboard\n\n\n\n\n\n\n\nQuestion Format Abbreviations\n\n\n\n\n\n\n\nAbbreviation\nQuestion Type\n\n\nMC\nMultiple Choice\n\n\nMA\nMultiple Answer\n\n\nTF\nTrue/False\n\n\nESS\nEssay\n\n\nORD\nOrdering\n\n\nMAT\nMatching\n\n\nFIB\nFill in the Blank\n\n\nFIL\nFile response\n\n\nNUM\nNumeric Response\n\n\nSR\nShort response\n\n\nOP\nOpinion\n\n\nFIB_PLUS\nMultiple Fill in the Blank\n\n\nJUMBLED_SENTENCE\nJumbled Sentence\n\n\nQUIZ_BOWL\nQuiz Bowl\n\n\n"
"text": "\n\n\nAbbreviation\nQuestion Type\n\n\nMC\nMultiple Choice\n\n\nMA\nMultiple Answer\n\n\nTF\nTrue/False\n\n\nESS\nEssay\n\n\nORD\nOrdering\n\n\nMAT\nMatching\n\n\nFIB\nFill in the Blank\n\n\nFIL\nFile response\n\n\nNUM\nNumeric Response\n\n\nSR\nShort response\n\n\nOP\nOpinion\n\n\nFIB_PLUS\nMultiple Fill in the Blank\n\n\nJUMBLED_SENTENCE\nJumbled Sentence\n\n\nQUIZ_BOWL\nQuiz Bowl\n\n\n"
},
{
"type": "Table",
"element_id": "f48657c4eb70d98975e567248d0ef4bb",
"type": "Title",
"element_id": "1d34c23ff08573afa07b42842b41277a",
"metadata": {
"data_source": {
"url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls",
@@ -56,10 +108,291 @@
},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"languages": [
"eng"
],
"page_number": 3,
"page_name": "Readme",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Unnamed: 0</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>http://www.cmu.edu/blackboard</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>File Information</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>Source</td>\n </tr>\n <tr>\n <td>http://www.cmu.edu/blackboard/files/evaluate/tests-example.xls</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>Version</td>\n </tr>\n <tr>\n <td>1.0 (January 2012)</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>Contact</td>\n </tr>\n <tr>\n <td>bb-help@andrew.cmu.edu</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>About</td>\n </tr>\n <tr>\n <td>This is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Readme"
},
"text": "\n\n\nUnnamed: 0\n\n\n\n\n\n\n\n\n\n\nhttp://www.cmu.edu/blackboard\n\n\n\n\n\nFile Information\n\n\n\n\n\n\n\n\nSource\n\n\nhttp://www.cmu.edu/blackboard/files/evaluate/tests-example.xls\n\n\n\n\n\n\n\n\nVersion\n\n\n1.0 (January 2012)\n\n\n\n\n\n\n\n\nContact\n\n\nbb-help@andrew.cmu.edu\n\n\n\n\n\n\n\n\nAbout\n\n\nThis is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions\n\n\n"
"text": "http://www.cmu.edu/blackboard"
},
{
"type": "Title",
"element_id": "85ada878f2345c23b8a74a931d2e20a4",
"metadata": {
"data_source": {
"url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls",
"record_locator": {
"user_pname": "devops@unstructuredio.onmicrosoft.com",
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls"
},
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43"
},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"languages": [
"eng"
],
"page_number": 3,
"page_name": "Readme"
},
"text": "File Information"
},
{
"type": "Title",
"element_id": "0e570ca6fabe24f94e52c1833f3ffd25",
"metadata": {
"data_source": {
"url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls",
"record_locator": {
"user_pname": "devops@unstructuredio.onmicrosoft.com",
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls"
},
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43"
},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"languages": [
"eng"
],
"page_number": 3,
"page_name": "Readme"
},
"text": "Source"
},
{
"type": "Title",
"element_id": "4cf4ff5597274d0c1ce8ae5a17ead4df",
"metadata": {
"data_source": {
"url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls",
"record_locator": {
"user_pname": "devops@unstructuredio.onmicrosoft.com",
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls"
},
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43"
},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"languages": [
"eng"
],
"page_number": 3,
"page_name": "Readme"
},
"text": "http://www.cmu.edu/blackboard/files/evaluate/tests-example.xls"
},
{
"type": "Title",
"element_id": "4cf4ff5597274d0c1ce8ae5a17ead4df",
"metadata": {
"data_source": {
"url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls",
"record_locator": {
"user_pname": "devops@unstructuredio.onmicrosoft.com",
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls"
},
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43"
},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel"
},
"text": "http://www.cmu.edu/blackboard/files/evaluate/tests-example.xls"
},
{
"type": "Title",
"element_id": "dd167905de0defcaf72de673ee44c074",
"metadata": {
"data_source": {
"url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls",
"record_locator": {
"user_pname": "devops@unstructuredio.onmicrosoft.com",
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls"
},
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43"
},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"languages": [
"eng"
],
"page_number": 3,
"page_name": "Readme"
},
"text": "Version"
},
{
"type": "UncategorizedText",
"element_id": "5f9d7b40d332fef76efdd0a97bcb8617",
"metadata": {
"data_source": {
"url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls",
"record_locator": {
"user_pname": "devops@unstructuredio.onmicrosoft.com",
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls"
},
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43"
},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"languages": [
"eng"
],
"page_number": 3,
"page_name": "Readme"
},
"text": "1.0 (January 2012)"
},
{
"type": "UncategorizedText",
"element_id": "5f9d7b40d332fef76efdd0a97bcb8617",
"metadata": {
"data_source": {
"url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls",
"record_locator": {
"user_pname": "devops@unstructuredio.onmicrosoft.com",
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls"
},
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43"
},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel"
},
"text": "1.0 (January 2012)"
},
{
"type": "Title",
"element_id": "2b5c3d26721ae9c350cf3009318b626f",
"metadata": {
"data_source": {
"url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls",
"record_locator": {
"user_pname": "devops@unstructuredio.onmicrosoft.com",
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls"
},
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43"
},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"languages": [
"eng"
],
"page_number": 3,
"page_name": "Readme"
},
"text": "Contact"
},
{
"type": "Title",
"element_id": "53d2273ac70fc31640cc45af840dbd42",
"metadata": {
"data_source": {
"url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls",
"record_locator": {
"user_pname": "devops@unstructuredio.onmicrosoft.com",
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls"
},
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43"
},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"languages": [
"eng"
],
"page_number": 3,
"page_name": "Readme"
},
"text": "bb-help@andrew.cmu.edu"
},
{
"type": "Title",
"element_id": "53d2273ac70fc31640cc45af840dbd42",
"metadata": {
"data_source": {
"url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls",
"record_locator": {
"user_pname": "devops@unstructuredio.onmicrosoft.com",
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls"
},
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43"
},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel"
},
"text": "bb-help@andrew.cmu.edu"
},
{
"type": "Title",
"element_id": "4efca0d10c5feb8e9b35eb1d994f2905",
"metadata": {
"data_source": {
"url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls",
"record_locator": {
"user_pname": "devops@unstructuredio.onmicrosoft.com",
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls"
},
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43"
},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"languages": [
"eng"
],
"page_number": 3,
"page_name": "Readme"
},
"text": "About"
},
{
"type": "NarrativeText",
"element_id": "4c9720f1540cc84d33e30e09aca8c077",
"metadata": {
"data_source": {
"url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls",
"record_locator": {
"user_pname": "devops@unstructuredio.onmicrosoft.com",
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls"
},
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43"
},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"languages": [
"eng"
],
"page_number": 3,
"page_name": "Readme"
},
"text": "This is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions"
},
{
"type": "NarrativeText",
"element_id": "4c9720f1540cc84d33e30e09aca8c077",
"metadata": {
"data_source": {
"url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls",
"record_locator": {
"user_pname": "devops@unstructuredio.onmicrosoft.com",
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls"
},
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43"
},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel"
},
"text": "This is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions"
}
]

unstructured/partition/xlsx.py

@@ -1,14 +1,20 @@
from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, List, Optional, Union, cast
from typing import IO, Any, BinaryIO, Dict, List, Optional, Tuple, Union, cast
import numpy as np
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
from unstructured.chunking.title import add_chunking_strategy
from unstructured.cleaners.core import clean_bullets
from unstructured.documents.elements import (
Element,
ElementMetadata,
ListItem,
NarrativeText,
Table,
Text,
Title,
process_metadata,
)
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
@@ -18,6 +24,13 @@ from unstructured.partition.common import (
get_last_modified_date_from_file,
spooled_to_bytes_io_if_needed,
)
from unstructured.partition.lang import detect_languages
from unstructured.partition.text_type import (
is_bulleted_text,
is_possible_narrative_text,
is_possible_numbered_list,
is_possible_title,
)
@process_metadata()
@@ -28,8 +41,10 @@ def partition_xlsx(
file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
metadata_filename: Optional[str] = None,
include_metadata: bool = True,
languages: List[str] = ["auto"],
metadata_last_modified: Optional[str] = None,
include_header: bool = True,
include_header: bool = False,
find_subtable: bool = True,
**kwargs,
) -> List[Element]:
"""Partitions Microsoft Excel Documents in .xlsx format into its document elements.
@@ -42,43 +57,340 @@ def partition_xlsx(
A file-like object using "rb" mode --> open(filename, "rb").
include_metadata
Determines whether or not metadata is included in the output.
languages
The list of languages present in the document.
metadata_last_modified
The day of the last modification
include_header
Determines whether or not header info is included in text and metadata.text_as_html
"""
exactly_one(filename=filename, file=file)
if not isinstance(languages, list):
raise TypeError(
'The language parameter must be a list of language codes as strings, ex. ["eng"]',
)
last_modification_date = None
header = 0 if include_header else None
if filename:
sheets = pd.read_excel(filename, sheet_name=None)
sheets = pd.read_excel(filename, sheet_name=None, header=header)
last_modification_date = get_last_modified_date(filename)
elif file:
f = spooled_to_bytes_io_if_needed(
cast(Union[BinaryIO, SpooledTemporaryFile], file),
)
sheets = pd.read_excel(f, sheet_name=None)
sheets = pd.read_excel(f, sheet_name=None, header=header)
last_modification_date = get_last_modified_date_from_file(file)
elements: List[Element] = []
page_number = 0
for sheet_name, table in sheets.items():
for sheet_name, sheet in sheets.items():
page_number += 1
html_text = table.to_html(index=False, header=include_header, na_rep="")
text = soupparser_fromstring(html_text).text_content()
if not find_subtable:
html_text = sheet.to_html(index=False, header=include_header, na_rep="")
text = soupparser_fromstring(html_text).text_content()
if include_metadata:
metadata = ElementMetadata(
text_as_html=html_text,
page_name=sheet_name,
page_number=page_number,
filename=metadata_filename or filename,
last_modified=metadata_last_modified or last_modification_date,
)
if include_metadata:
metadata = ElementMetadata(
text_as_html=html_text,
page_name=sheet_name,
page_number=page_number,
filename=metadata_filename or filename,
last_modified=metadata_last_modified or last_modification_date,
)
else:
metadata = ElementMetadata()
table = Table(text=text, metadata=metadata)
elements.append(table)
else:
metadata = ElementMetadata()
_connected_components = _get_connected_components(sheet)
for _connected_component, _min_max_coords in _connected_components:
min_x, min_y, max_x, max_y = _min_max_coords
table = Table(text=text, metadata=metadata)
elements.append(table)
subtable = sheet.iloc[min_x : max_x + 1, min_y : max_y + 1] # noqa: E203
single_non_empty_rows, single_non_empty_row_contents = _single_non_empty_rows(
subtable,
)
(
front_non_consecutive,
last_non_consecutive,
) = _find_first_and_last_non_consecutive_row(
single_non_empty_rows,
subtable.shape,
)
metadata = _get_metadata(
include_metadata,
sheet_name,
page_number,
metadata_filename or filename,
metadata_last_modified or last_modification_date,
)
# NOTE(klaijan) - need to explicitly define the condition to avoid the case of 0
if front_non_consecutive is not None and last_non_consecutive is not None:
first_row = int(front_non_consecutive - max_x)
last_row = int(max_x - last_non_consecutive)
subtable = _get_sub_subtable(subtable, (first_row, last_row))
if front_non_consecutive is not None:
for content in single_non_empty_row_contents[: front_non_consecutive + 1]:
languages = detect_languages(str(content), languages)
element = _check_content_element_type(str(content))
element.metadata = metadata
element.metadata.languages = languages
elements.append(element)
if subtable is not None and len(subtable) == 1:
element = _check_content_element_type(str(subtable.iloc[0].values[0]))
elements.append(element)
elif subtable is not None:
# parse subtables as html
html_text = subtable.to_html(index=False, header=include_header, na_rep="")
text = soupparser_fromstring(html_text).text_content()
languages = detect_languages(text, languages)
subtable = Table(text=text)
subtable.metadata = metadata
subtable.metadata.text_as_html = html_text
subtable.metadata.languages = languages
elements.append(subtable)
if front_non_consecutive is not None and last_non_consecutive is not None:
for content in single_non_empty_row_contents[
front_non_consecutive + 1 : # noqa: E203
]:
languages = detect_languages(str(content), languages)
element = _check_content_element_type(str(content))
element.metadata = metadata
element.metadata.languages = languages
elements.append(element)
return elements
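
For orientation, here is a minimal usage sketch of the reworked entry point; it is not part of the diff itself. It assumes this branch is installed and reuses the `example-docs/vodafone.xlsx` sample workbook from this PR; the keyword arguments simply spell out the new defaults.

```python
from unstructured.partition.xlsx import partition_xlsx

# New defaults: no header row is assumed and each sheet is split into subtables.
elements = partition_xlsx(
    filename="example-docs/vodafone.xlsx",
    include_header=False,
    find_subtable=True,
    languages=["auto"],
)

for element in elements:
    # Leading/trailing single-cell rows come back as Title/NarrativeText/etc.;
    # each subtable body is a Table with metadata.text_as_html populated.
    print(type(element).__name__, "|", element.text[:60])
```
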
def _get_connected_components(
sheet: pd.DataFrame,
filter: bool = True,
):
"""
Identify connected components of non-empty cells in an excel sheet.
Args:
sheet: an Excel sheet read into a DataFrame.
filter (bool, optional): If True (default), filters out overlapping components
to return distinct components.
Returns:
A list of tuples, each containing:
- A list of tuples representing the connected component's cell coordinates.
- A tuple with the min and max x and y coordinates bounding the connected component.
Note:
This function performs a depth-first search (DFS) to identify connected components of
non-empty cells in the sheet. If 'filter' is set to True, it also filters out
overlapping components to return distinct components.
"""
max_row, max_col = sheet.shape
visited = set()
connected_components = []
def dfs(row, col, component):
if (
row < 0
or row >= sheet.shape[0]
or col < 0
or col >= sheet.shape[1]
or (row, col) in visited
):
return
visited.add((row, col))
if not pd.isna(sheet.iat[row, col]):
component.append((row, col))
# Explore neighboring cells
dfs(row - 1, col, component) # Above
dfs(row + 1, col, component) # Below
dfs(row, col - 1, component) # Left
dfs(row, col + 1, component) # Right
for row in range(max_row):
for col in range(max_col):
if (row, col) not in visited and not pd.isna(sheet.iat[row, col]):
component: List[dict] = []
dfs(row, col, component)
min_x, min_y, max_x, max_y = _find_min_max_coord(component)
connected_components.append(
{
"component": component,
"min_x": min_x,
"min_y": min_y,
"max_x": max_x,
"max_y": max_y,
},
)
if filter:
connected_components = _filter_overlapping_tables(connected_components)
return [
(
connected_component["component"],
(
connected_component["min_x"],
connected_component["min_y"],
connected_component["max_x"],
connected_component["max_y"],
),
)
for connected_component in connected_components
]
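
To make the connected-components search concrete, here is a small hedged sketch over a toy DataFrame. It assumes the private `_get_connected_components` helper is importable exactly as defined above; the sheet contents are invented.

```python
import numpy as np
import pandas as pd

from unstructured.partition.xlsx import _get_connected_components

# Toy sheet: a lone title cell, a 2x2 block of values, and an isolated note.
sheet = pd.DataFrame(
    [
        ["Sheet title", np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        ["a", "b", np.nan],
        ["c", "d", np.nan],
        [np.nan, np.nan, "note"],
    ]
)

for component, (min_x, min_y, max_x, max_y) in _get_connected_components(sheet):
    print(len(component), (min_x, min_y, max_x, max_y))
# Bounding boxes: (0, 0, 0, 0) for the title cell,
# (2, 0, 3, 1) for the 2x2 block, and (4, 2, 4, 2) for the note.
```

Because the DFS only expands from non-empty cells, blank rows and columns act as separators between candidate subtables.
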
def _filter_overlapping_tables(
connected_components: List[Dict[Any, Any]],
) -> List[Dict[Any, Any]]:
"""
Filter out overlapping connected components to return distinct components.
"""
sorted_components = sorted(connected_components, key=lambda x: x["min_x"])
merged_components: List[dict] = []
current_component = None
for component in sorted_components:
if current_component is None:
current_component = component
else:
# Check if component overlaps with the current_component
if component["min_x"] <= current_component["max_x"]:
# Merge the components and update min_x, max_x
current_component["component"].extend(component["component"])
current_component["min_x"] = min(current_component["min_x"], component["min_x"])
current_component["max_x"] = max(current_component["max_x"], component["max_x"])
current_component["min_y"] = min(current_component["min_y"], component["min_y"])
current_component["max_y"] = max(current_component["max_y"], component["max_y"])
else:
# No overlap, add the current_component to the merged list
merged_components.append(current_component)
# Update the current_component
current_component = component
# Append the last current_component to the merged list
if current_component is not None:
merged_components.append(current_component)
return merged_components
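
The merge rule only compares row extents (`min_x`/`max_x`), so components that overlap vertically collapse into a single bounding box. A hedged sketch with hand-built, abbreviated component dicts (cell lists truncated for brevity):

```python
from unstructured.partition.xlsx import _filter_overlapping_tables

components = [
    {"component": [(0, 0)], "min_x": 0, "min_y": 0, "max_x": 3, "max_y": 1},
    {"component": [(2, 4)], "min_x": 2, "min_y": 4, "max_x": 5, "max_y": 6},
    {"component": [(8, 0)], "min_x": 8, "min_y": 0, "max_x": 9, "max_y": 2},
]

merged = _filter_overlapping_tables(components)
print(len(merged))                             # 2: row ranges 0-3 and 2-5 overlap, so they merge
print(merged[0]["min_x"], merged[0]["max_x"])  # 0 5
```
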
def _find_min_max_coord(
connected_component: List[Dict[Any, Any]],
) -> Tuple[Union[int, float], Union[int, float], Union[int, float], Union[int, float]]:
"""
Find the minimum and maximum coordinates (bounding box) of a connected component.
"""
min_x, min_y, max_x, max_y = float("inf"), float("inf"), float("-inf"), float("-inf")
for _x, _y in connected_component:
if _x < min_x:
min_x = _x
if _y < min_y:
min_y = _y
if _x > max_x:
max_x = _x
if _y > max_y:
max_y = _y
return min_x, min_y, max_x, max_y
def _get_sub_subtable(subtable: pd.DataFrame, first_and_last_row: Tuple[int, int]) -> pd.DataFrame:
"""
Extract a sub-subtable from a given subtable based on the first and last row range.
"""
# TODO(klaijan) - to further check for sub subtable, we could check whether
# two consecutive rows contain a full row of cells.
# if yes, it might not be a header. We should check the length.
first_row, last_row = first_and_last_row
if last_row == first_row:
return None
return subtable.iloc[first_row : last_row + 1] # noqa: E203
def _find_first_and_last_non_consecutive_row(
row_indices: List[int],
table_shape: Tuple[int, int],
) -> Tuple[Optional[int], Optional[int]]:
"""
Find the indices of the first and last non-consecutive rows in a list of row indices.
"""
# If the table is a single column with one or more rows
table_rows, table_cols = table_shape
if len(row_indices) == 1 or (len(row_indices) == table_rows and table_cols == 1):
return row_indices[0], row_indices[0]
arr = np.array(row_indices)
front_non_consecutive = next(
(i for i, (x, y) in enumerate(zip(arr, arr[1:])) if x + 1 != y),
None,
)
reversed_arr = arr[::-1] # Reverse the array
last_non_consecutive = next(
(i for i, (x, y) in enumerate(zip(reversed_arr, reversed_arr[1:])) if x - 1 != y),
None,
)
return front_non_consecutive, last_non_consecutive
def _single_non_empty_rows(subtable) -> Tuple[List[int], List[str]]:
"""
Identify single non-empty rows in a subtable and extract their row indices and contents.
"""
single_non_empty_rows = []
single_non_empty_row_contents = []
for index, row in subtable.iterrows():
if row.count() == 1:
single_non_empty_rows.append(index)
single_non_empty_row_contents.append(row.dropna().iloc[0])
return single_non_empty_rows, single_non_empty_row_contents
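
Inside the subtable loop these two helpers work together: `_single_non_empty_rows` peels off candidate title/footer rows, and `_find_first_and_last_non_consecutive_row` tells the loop how many of them sit above and below the table body. A hedged sketch with an invented subtable, again assuming the private helpers are importable as defined above:

```python
import numpy as np
import pandas as pd

from unstructured.partition.xlsx import (
    _find_first_and_last_non_consecutive_row,
    _single_non_empty_rows,
)

# Two title-like rows on top, a two-column body, and a single footer row.
subtable = pd.DataFrame(
    [
        ["Sheet title", np.nan],
        ["Reporting period", np.nan],
        ["Item", "Value"],
        ["Widgets", "42"],
        ["Footnote: unaudited", np.nan],
    ]
)

rows, contents = _single_non_empty_rows(subtable)
print(rows)      # [0, 1, 4]
print(contents)  # ['Sheet title', 'Reporting period', 'Footnote: unaudited']

front, last = _find_first_and_last_non_consecutive_row(rows, subtable.shape)
print(front, last)  # 1 0 -> two leading text rows, one trailing text row
```
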
def _check_content_element_type(text: str) -> Element:
"""
Classify the type of content element based on its text.
"""
if is_bulleted_text(text):
return ListItem(
text=clean_bullets(text),
)
elif is_possible_numbered_list(text):
return ListItem(
text=text,
)
elif is_possible_narrative_text(text):
return NarrativeText(
text=text,
)
elif is_possible_title(text):
return Title(
text=text,
)
else:
return Text(
text=text,
)
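
The classifier simply reuses the existing text-type heuristics, so peeled-off rows get the same element types any other partitioner would assign. A rough sketch with invented strings; the expected types in the comments depend on those heuristics, so treat them as likely rather than guaranteed:

```python
from unstructured.partition.xlsx import _check_content_element_type

samples = [
    "• First bullet point",      # likely ListItem (bullet is stripped)
    "1. Do the first step",      # likely ListItem (numbered list)
    "This sheet contains an example narrative sentence for the reader.",  # likely NarrativeText
    "Revenue Summary",           # likely Title
]

for text in samples:
    element = _check_content_element_type(text)
    print(type(element).__name__, "|", element.text)
```
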
def _get_metadata(
include_metadata: bool = True,
sheet_name: Optional[str] = None,
page_number: Optional[int] = -1,
filename: Optional[str] = None,
last_modification_date: Union[str, None] = None,
) -> ElementMetadata:
"""Returns metadata depending on `include_metadata` flag"""
if include_metadata:
metadata = ElementMetadata(
page_name=sheet_name,
page_number=page_number,
filename=filename,
last_modified=last_modification_date,
)
else:
metadata = ElementMetadata()
return metadata