2024-05-22 17:51:08 -07:00
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
2023-04-17 23:11:21 -07:00
|
|
|
|
import json
|
2023-01-09 16:15:14 -05:00
|
|
|
|
import os
|
|
|
|
|
import pathlib
|
2024-01-04 09:52:00 -08:00
|
|
|
|
import tempfile
|
2023-01-27 12:08:18 -05:00
|
|
|
|
import warnings
|
2023-05-15 13:23:19 -05:00
|
|
|
|
from importlib import import_module
|
Feat: return base64 encoded images for PDF's (#2310)
Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(
filename="example-docs/embedded-images-tables.pdf",
strategy="hi_res",
extract_element_types=["Image", "Table"],
extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition
elements = partition(
filename="example-docs/embedded-images-tables.pdf",
strategy="hi_res",
pdf_extract_element_types=["Image", "Table"],
pdf_extract_to_payload=True,
)
```
2023-12-26 21:39:01 -08:00
|
|
|
|
from unittest.mock import Mock, patch
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
|
|
|
|
import docx
|
2023-02-27 17:30:54 +01:00
|
|
|
|
import pytest
|
2024-01-17 17:50:36 -05:00
|
|
|
|
from PIL import Image
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
2024-01-04 09:52:00 -08:00
|
|
|
|
from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
|
fix: stop csv and tsv dropping the first line of the file (#1530)
The current code assumes the first line of a csv or tsv file is a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.
Here is a snippet of code that demonstrates the current behavior and the
proposed fix
```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
c1 = """
Stanley Cups,,
Team,Location,Stanley Cups
Blues,STL,1
Flyers,PHI,2
Maple Leafs,TOR,13
"""
f = "./test.csv"
with open(f, 'w') as ff:
ff.write(c1)
print("Suggested Improvement Keep First Line")
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
print("\n\nOriginal Looses First Line")
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```
---------
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
2023-10-17 00:59:35 +02:00
|
|
|
|
from test_unstructured.partition.test_constants import (
|
|
|
|
|
EXPECTED_TABLE,
|
|
|
|
|
EXPECTED_TABLE_XLSX,
|
|
|
|
|
EXPECTED_TEXT,
|
|
|
|
|
EXPECTED_TEXT_XLSX,
|
|
|
|
|
EXPECTED_TITLE,
|
|
|
|
|
)
|
2023-09-11 16:00:14 -05:00
|
|
|
|
from unstructured.chunking.title import chunk_by_title
|
2023-05-16 15:40:40 -04:00
|
|
|
|
from unstructured.cleaners.core import clean_extra_whitespace
|
2023-02-27 17:30:54 +01:00
|
|
|
|
from unstructured.documents.elements import (
|
|
|
|
|
Address,
|
2023-05-15 13:23:19 -05:00
|
|
|
|
ElementMetadata,
|
2023-02-27 17:30:54 +01:00
|
|
|
|
ListItem,
|
|
|
|
|
NarrativeText,
|
2023-05-16 15:40:40 -04:00
|
|
|
|
Table,
|
2023-10-03 09:40:34 -07:00
|
|
|
|
TableChunk,
|
2023-02-27 17:30:54 +01:00
|
|
|
|
Text,
|
|
|
|
|
Title,
|
|
|
|
|
)
|
2023-05-15 13:23:19 -05:00
|
|
|
|
from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, FileType
|
2023-02-27 17:30:54 +01:00
|
|
|
|
from unstructured.partition import auto
|
2023-08-21 23:00:21 -04:00
|
|
|
|
from unstructured.partition.auto import _get_partition_with_extras, partition
|
2023-02-17 09:30:23 -05:00
|
|
|
|
from unstructured.partition.common import convert_office_doc
|
2023-11-15 21:41:02 -08:00
|
|
|
|
from unstructured.partition.utils.constants import PartitionStrategy
|
2023-04-17 23:11:21 -07:00
|
|
|
|
from unstructured.staging.base import elements_to_json
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
2023-01-23 12:03:09 -05:00
|
|
|
|
# -- resolved directory that contains this test module --
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# -- shared sample documents live two levels above this module, in `example-docs` --
EXAMPLE_DOCS_DIRECTORY = os.path.join(
    DIRECTORY,
    "..",
    "..",
    "example-docs",
)
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
|
|
|
|
# -- elements the email tests below expect from partitioning the sample .eml file --
EXPECTED_EMAIL_OUTPUT = [
    NarrativeText("This is a test email to use for unit tests."),
    Title("Important points:"),
    ListItem("Roses are red"),
    ListItem("Violets are blue"),
]
|
|
|
|
|
|
2023-06-16 17:52:13 -07:00
|
|
|
|
# -- sample email, relative to EXAMPLE_DOCS_DIRECTORY --
EML_TEST_FILE = "eml/fake-email.eml"

# -- True when the `/.dockerenv` marker file exists; used below to skip some tests --
is_in_docker = os.path.exists("/.dockerenv")
|
|
|
|
|
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
|
|
|
|
def test_auto_partition_email_from_filename():
    """Partitioning the sample email by path yields the expected elements and file metadata."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)

    elements = partition(filename=eml_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
    # -- file metadata is derived from the path the document was loaded from --
    assert elements[0].metadata.filename == os.path.basename(eml_path)
    assert elements[0].metadata.file_directory == os.path.split(eml_path)[0]
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_email_from_file():
    """Partitioning the sample email from a text-mode file object yields the expected elements."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)

    with open(eml_path) as eml_file:
        elements = partition(file=eml_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_email_from_file_rb():
    """Partitioning the sample email from a binary-mode file object yields the expected elements."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)

    with open(eml_path, "rb") as eml_file:
        elements = partition(file=eml_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
|
|
|
|
|
|
|
|
|
|
|
2023-02-27 17:30:54 +01:00
|
|
|
|
@pytest.fixture()
def mock_docx_document():
    """Build an in-memory Word document with one paragraph per element kind under test."""
    doc = docx.Document()

    styled_paragraphs = [
        ("These are a few of my favorite things:", "Heading 1"),
        # NOTE(robinson) - this should get picked up as a list item due to the •
        ("• Parrots", "Normal"),
        ("Hockey", "List Bullet"),
        # NOTE(robinson) - this should get picked up as a title
        ("Analysis", "Normal"),
        # NOTE(robinson) - this should get dropped because it is empty
        ("", "Normal"),
        # NOTE(robinson) - this should get picked up as a narrative text
        ("This is my first thought. This is my second thought.", "Normal"),
        ("This is my third thought.", "Body Text"),
    ]
    for paragraph_text, paragraph_style in styled_paragraphs:
        doc.add_paragraph(paragraph_text, style=paragraph_style)

    # NOTE(robinson) - this should just be regular text
    doc.add_paragraph("2023")

    return doc
|
|
|
|
|
|
|
|
|
|
|
2023-02-27 17:30:54 +01:00
|
|
|
|
@pytest.fixture()
def expected_docx_elements():
    """Elements the docx/doc tests expect from partitioning `mock_docx_document`."""
    expected = [
        Title(text="These are a few of my favorite things:"),
        ListItem(text="Parrots"),
        ListItem(text="Hockey"),
        Title(text="Analysis"),
        NarrativeText(text="This is my first thought. This is my second thought."),
        NarrativeText(text="This is my third thought."),
        Text(text="2023"),
    ]
    return expected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_docx_with_filename(mock_docx_document, expected_docx_elements, tmpdir):
    """Partition a generated .docx by path and check the elements and filename metadata."""
    # -- save inside the per-test `tmpdir`; `tmpdir.dirname` is its parent directory, which is
    # -- not unique to this test, so a fixed file name there can collide with other tests --
    filename = os.path.join(tmpdir, "mock_document.docx")
    mock_docx_document.save(filename)

    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
    assert elements[0].metadata.filename == os.path.basename(filename)
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_elements, tmpdir):
    """Partition a generated .docx from a binary file object and check the elements."""
    # -- save inside the per-test `tmpdir`; `tmpdir.dirname` is its parent directory, which is
    # -- not unique to this test, so a fixed file name there can collide with other tests --
    filename = os.path.join(tmpdir, "mock_document.docx")
    mock_docx_document.save(filename)

    with open(filename, "rb") as f:
        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
|
|
|
|
|
|
|
|
|
|
|
2024-05-22 17:51:08 -07:00
|
|
|
|
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
)
def test_auto_partition_doc_with_filename(
    mock_docx_document,
    expected_docx_elements,
    tmp_path: pathlib.Path,
    pass_metadata_filename,
    content_type,
):
    """Partition a .doc converted from the mock .docx, by path, across metadata/content-type combos."""
    source_docx = str(tmp_path / "mock_document.docx")
    converted_doc = str(tmp_path / "mock_document.doc")

    # -- write the .docx, then convert it to .doc in the same temporary directory --
    mock_docx_document.save(source_docx)
    convert_office_doc(source_docx, str(tmp_path), "doc")

    if pass_metadata_filename:
        metadata_filename = converted_doc
    else:
        metadata_filename = None

    elements = partition(
        filename=converted_doc,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    assert elements == expected_docx_elements
    assert elements[0].metadata.filename == "mock_document.doc"
    assert elements[0].metadata.file_directory == str(tmp_path)
|
2023-02-17 09:30:23 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
# determine that the file is an .doc document
@pytest.mark.xfail()
def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements, tmpdir):
    """Partition a .doc converted from the mock .docx, from a binary file object."""
    # -- keep both files inside the per-test `tmpdir`; `tmpdir.dirname` is its parent
    # -- directory, which is not unique to this test --
    docx_filename = os.path.join(tmpdir, "mock_document.docx")
    doc_filename = os.path.join(tmpdir, "mock_document.doc")
    mock_docx_document.save(docx_filename)
    convert_office_doc(docx_filename, str(tmpdir), "doc")

    with open(doc_filename, "rb") as f:
        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
|
|
|
|
|
|
|
|
|
|
|
2023-03-24 16:32:45 -07:00
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_filename(pass_metadata_filename, content_type):
    """Partition an HTML file by path across metadata-filename/content-type combinations."""
    html_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
    partition_kwargs = {
        "filename": html_path,
        "metadata_filename": html_path if pass_metadata_filename else None,
        "content_type": content_type,
        "strategy": PartitionStrategy.HI_RES,
    }

    elements = partition(**partition_kwargs)

    assert len(elements) > 0
    assert elements[0].metadata.filename == os.path.basename(html_path)
    assert elements[0].metadata.file_directory == os.path.split(html_path)[0]
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
|
|
|
|
|
2023-03-24 16:32:45 -07:00
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_file(pass_metadata_filename, content_type):
    """Partition an HTML file from a text-mode file object across metadata/content-type combos."""
    html_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
    if pass_metadata_filename:
        metadata_filename = html_path
    else:
        metadata_filename = None

    with open(html_path) as html_file:
        elements = partition(
            file=html_file,
            metadata_filename=metadata_filename,
            content_type=content_type,
            strategy=PartitionStrategy.HI_RES,
        )

    assert len(elements) > 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_html_from_file_rb():
    """Partitioning an HTML file from a binary-mode file object produces at least one element."""
    html_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")

    with open(html_path, "rb") as html_file:
        elements = partition(file=html_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
|
|
|
|
|
|
|
|
|
|
|
2024-05-15 18:53:15 -04:00
|
|
|
|
# NOTE(robinson) - skipping this test with docker image to avoid putting the
# test fixtures into the image
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partitioned_json_output_maintains_consistency_with_fixture_elements():
    """Re-partitioning a stored JSON output file by path reproduces its elements (metadata aside)."""
    original_file_name = "spring-weather.html"
    fixture_dir = (
        pathlib.Path(DIRECTORY).parents[1]
        / "test_unstructured_ingest"
        / "expected-structured-output"
        / "azure"
    )
    json_file_path = fixture_dir / f"{original_file_name}.json"

    with open(json_file_path) as json_f:
        expected_result = json.load(json_f)

    elements = partition(
        filename=json_file_path,
        # -- use the original file name to get the same element IDs (hashes) --
        metadata_filename=original_file_name,
        strategy=PartitionStrategy.HI_RES,
    )
    partitioning_result = json.loads(elements_to_json(elements))

    # -- metadata is excluded from the comparison on both sides --
    for actual_elem in partitioning_result:
        actual_elem.pop("metadata")
    for expected_elem in expected_result:
        expected_elem.pop("metadata")

    assert expected_result == partitioning_result
|
2023-04-17 23:11:21 -07:00
|
|
|
|
|
|
|
|
|
|
2023-07-25 15:59:45 -04:00
|
|
|
|
def test_auto_partition_json_raises_with_unprocessable_json(tmpdir):
    """`partition()` raises `ValueError` for JSON that is not a list of element dicts."""
    # NOTE(robinson) - This is unprocessable because it is not a list of dicts,
    # per the Unstructured ISD format
    unprocessable_json = '{"hi": "there"}'
    json_path = os.path.join(tmpdir, "unprocessable.json")

    with open(json_path, "w") as out_file:
        out_file.write(unprocessable_json)

    with pytest.raises(ValueError):
        partition(filename=json_path)
|
|
|
|
|
|
|
|
|
|
|
2023-04-17 23:11:21 -07:00
|
|
|
|
@pytest.mark.xfail(
    reason="parsed as text not json, https://github.com/Unstructured-IO/unstructured/issues/492",
)
def test_auto_partition_json_from_file():
    """Test auto-processing an unstructured json output file by file handle."""
    # NOTE(review): the sibling consistency test in this module reads this fixture from an
    # `azure` directory rather than `azure-blob-storage` - confirm this path still exists.
    filename = os.path.join(
        EXAMPLE_DOCS_DIRECTORY,
        "..",
        "test_unstructured_ingest",
        "expected-structured-output",
        "azure-blob-storage",
        "spring-weather.html.json",
    )
    # -- read the fixture with an explicit encoding so both reads of this file decode it the
    # -- same way regardless of the platform default --
    with open(filename, encoding="utf-8") as json_f:
        json_data = json.load(json_f)
    with open(filename, encoding="utf-8") as partition_f:
        json_elems = json.loads(
            elements_to_json(partition(file=partition_f, strategy=PartitionStrategy.HI_RES))
        )
    for elem in json_elems:
        # coordinates are always in the element data structures, even if None
        elem.pop("coordinates")
        elem.pop("coordinate_system")
    assert json_data == json_elems
|
|
|
|
|
|
|
|
|
|
|
2023-01-13 16:39:53 -05:00
|
|
|
|
# -- elements the plain-text tests below expect from partitioning `fake-text.txt` --
EXPECTED_TEXT_OUTPUT = [
    NarrativeText("This is a test document to use for unit tests."),
    Address("Doylestown, PA 18901"),
    Title("Important points:"),
    ListItem("Hamburgers are delicious"),
    ListItem("Dogs are the best"),
    ListItem("I love fuzzy blankets"),
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_text_from_filename():
    """Partitioning the sample text file by path yields the expected elements and file metadata."""
    text_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")

    elements = partition(filename=text_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_TEXT_OUTPUT
    # -- file metadata is derived from the path the document was loaded from --
    assert elements[0].metadata.filename == os.path.basename(text_path)
    assert elements[0].metadata.file_directory == os.path.split(text_path)[0]
|
2023-01-13 16:39:53 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_text_from_file():
    """Partitioning the sample text file from a text-mode file object yields the expected elements."""
    text_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")

    with open(text_path) as text_file:
        elements = partition(file=text_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_TEXT_OUTPUT
|
2023-01-27 12:08:18 -05:00
|
|
|
|
|
|
|
|
|
|
2023-03-24 16:32:45 -07:00
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_filename(pass_metadata_filename, content_type, request):
    """Partition a PDF by path and check the title, its file metadata and the author paragraph."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")

    elements = partition(
        filename=pdf_path,
        metadata_filename=pdf_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    # NOTE(alan): Xfail since new model skips the word Zejiang
    request.applymarker(pytest.mark.xfail)

    title_idx = 3
    title = elements[title_idx]
    assert isinstance(title, Title)
    assert title.text.startswith("LayoutParser")

    assert title.metadata.filename == os.path.basename(pdf_path)
    assert title.metadata.file_directory == os.path.split(pdf_path)[0]

    # -- the element right after the title is expected to be the author paragraph --
    authors = elements[title_idx + 1]
    assert isinstance(authors, NarrativeText)
    assert authors.text.startswith("Zejiang Shen")
|
2023-05-31 13:50:15 -05:00
|
|
|
|
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
2023-04-21 12:01:29 -05:00
|
|
|
|
def test_auto_partition_pdf_uses_table_extraction():
    """`pdf_infer_table_structure=True` reaches the patched OCR step as `infer_table_structure`."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    ocr_target = "unstructured.partition.pdf_image.ocr.process_file_with_ocr"

    with patch(ocr_target) as mock_process_file_with_ocr:
        partition(pdf_path, pdf_infer_table_structure=True, strategy=PartitionStrategy.HI_RES)

    # -- inspect the keyword arguments of the most recent call to the patched function --
    assert mock_process_file_with_ocr.call_args.kwargs["infer_table_structure"]
|
2023-04-21 12:01:29 -05:00
|
|
|
|
|
|
|
|
|
|
2023-08-21 23:00:21 -04:00
|
|
|
|
def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
    """`partition()` with the FAST strategy calls the PDF partitioner once with the expected kwargs."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")

    # -- exact keyword arguments the PDF partitioner is expected to receive --
    expected_kwargs = {
        "filename": pdf_path,
        "file": None,
        "url": None,
        "strategy": PartitionStrategy.FAST,
        "languages": None,
        "metadata_filename": None,
        "include_page_breaks": False,
        "infer_table_structure": False,
        "extract_images_in_pdf": False,
        "extract_image_block_types": None,
        "extract_image_block_output_dir": None,
        "extract_image_block_to_payload": False,
        "hi_res_model_name": None,
        "date_from_file_object": False,
        "starting_page_number": 1,
    }

    stub_elements = [NarrativeText("Hello there!")]
    with patch.object(auto, "partition_pdf", return_value=stub_elements) as mock_partition:
        # -- route the "pdf" entry of the extras map to the mock as well --
        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {"pdf": mock_partition})
        partition(filename=pdf_path, strategy=PartitionStrategy.FAST)

    mock_partition.assert_called_once_with(**expected_kwargs)
|
|
|
|
|
|
|
|
|
|
|
2023-03-24 16:32:45 -07:00
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, request):
    """Partition a PDF from a binary file object and check the title and author paragraph."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    if pass_metadata_filename:
        metadata_filename = pdf_path
    else:
        metadata_filename = None

    with open(pdf_path, "rb") as pdf_file:
        elements = partition(
            file=pdf_file,
            metadata_filename=metadata_filename,
            content_type=content_type,
            strategy=PartitionStrategy.HI_RES,
        )

    # NOTE(alan): Xfail since new model skips the word Zejiang
    request.applymarker(pytest.mark.xfail)

    title_idx = 3
    assert isinstance(elements[title_idx], Title)
    assert elements[title_idx].text.startswith("LayoutParser")

    # -- the element right after the title is expected to be the author paragraph --
    authors_idx = title_idx + 1
    assert isinstance(elements[authors_idx], NarrativeText)
    assert elements[authors_idx].text.startswith("Zejiang Shen")
|
2023-01-27 12:08:18 -05:00
|
|
|
|
|
|
|
|
|
|
2023-09-18 11:42:02 -04:00
|
|
|
|
def test_auto_partition_formats_languages_for_tesseract():
|
|
|
|
|
filename = "example-docs/chi_sim_image.jpeg"
|
|
|
|
|
with patch(
|
2023-12-01 12:56:31 -08:00
|
|
|
|
"unstructured.partition.pdf_image.ocr.process_file_with_ocr",
|
Refactor: support entire page OCR with `ocr_mode` and `ocr_languages` (#1579)
## Summary
Second part of OCR refactor to move it from inference repo to
unstructured repo, first part is done in
https://github.com/Unstructured-IO/unstructured-inference/pull/231. This
PR adds OCR processing logic for entire-page OCR and supports two OCR
modes, "entire_page" or "individual_blocks".
The updated workflow for `Hi_res` partition:
* pass the document as data/filename to inference repo to get
`inferred_layout` (DocumentLayout)
* pass the document as data/filename to OCR module, which first open the
document (create temp file/dir as needed), and split the document by
pages (convert PDF pages to image pages for PDF file)
* if ocr mode is `"entire_page"`
* OCR the entire image
* merge the OCR layout with inferred page layout
* if ocr mode is `"individual_blocks"`
* from inferred page layout, find element with no extracted text, crop
the entire image by the bboxes of the element
* replace empty text element with the text obtained from OCR the cropped
image
* return all merged PageLayouts and form a DocumentLayout subject for
later on process
This PR also bumps `unstructured-inference==0.7.2` since the branch relies
on the OCR refactor from unstructured-inference.
## Test
```
from unstructured.partition.auto import partition
entrie_page_ocr_mode_elements = partition(filename="example-docs/english-and-korean.png", ocr_mode="entire_page", ocr_languages="eng+kor", strategy="hi_res")
individual_blocks_ocr_mode_elements = partition(filename="example-docs/english-and-korean.png", ocr_mode="individual_blocks", ocr_languages="eng+kor", strategy="hi_res")
print([el.text for el in entrie_page_ocr_mode_elements])
print([el.text for el in individual_blocks_ocr_mode_elements])
```
latest output:
```
# entrie_page
['RULES AND INSTRUCTIONS 1. Template for day 1 (korean) , for day 2 (English) for day 3 both English and korean. 2. Use all your accounts. use different emails to send. Its better to have many email', 'accounts.', 'Note: Remember to write your own "OPENING MESSAGE" before you copy and paste the template. please always include [TREASURE HARUTO] for example:', '안녕하세요, 저 희 는 YGEAS 그룹 TREASUREWH HARUTOM|2] 팬 입니다. 팬 으 로서, HARUTO 씨 받 는 대 우 에 대해 의 구 심 과 불 공 평 함 을 LRU, 이 일 을 통해 저 희 의 의 혹 을 전 달 하여 귀 사 의 진지한 민 과 적극적인 답 변 을 받을 수 있 기 를 바랍니다.', '3. CC Harutonations@gmail.com so we can keep track of how many emails were', 'successfully sent', '4. Use the hashtag of Haruto on your tweet to show that vou have sent vour email]', '메 고']
# individual_blocks
['RULES AND INSTRUCTIONS 1. Template for day 1 (korean) , for day 2 (English) for day 3 both English and korean. 2. Use all your accounts. use different emails to send. Its better to have many email', 'Note: Remember to write your own "OPENING MESSAGE" before you copy and paste the template. please always include [TREASURE HARUTO] for example:', '안녕하세요, 저 희 는 YGEAS 그룹 TREASURES HARUTOM| 2] 팬 입니다. 팬 으로서, HARUTO 씨 받 는 대 우 에 대해 의 구 심 과 habe ERO, 이 머 일 을 적극 저 희 의 ASS 전 달 하여 귀 사 의 진지한 고 2 있 기 를 바랍니다.', '3. CC Harutonations@gmail.com so we can keep track of how many emails were ciiccecefisliy cant', 'VULLESSIULY Set 4. Use the hashtag of Haruto on your tweet to show that you have sent your email']
```
---------
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: yuming-long <yuming-long@users.noreply.github.com>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
2023-10-06 18:54:49 -04:00
|
|
|
|
) as mock_process_file_with_ocr:
|
2023-11-15 21:41:02 -08:00
|
|
|
|
partition(filename, strategy=PartitionStrategy.HI_RES, languages=["zh"])
|
Refactor: support entire page OCR with `ocr_mode` and `ocr_languages` (#1579)
## Summary
Second part of OCR refactor to move it from inference repo to
unstructured repo, first part is done in
https://github.com/Unstructured-IO/unstructured-inference/pull/231. This
PR adds OCR processing logic for entire-page OCR and supports two OCR
modes, "entire_page" or "individual_blocks".
The updated workflow for `Hi_res` partition:
* pass the document as data/filename to inference repo to get
`inferred_layout` (DocumentLayout)
* pass the document as data/filename to OCR module, which first open the
document (create temp file/dir as needed), and split the document by
pages (convert PDF pages to image pages for PDF file)
* if ocr mode is `"entire_page"`
* OCR the entire image
* merge the OCR layout with inferred page layout
* if ocr mode is `"individual_blocks"`
* from inferred page layout, find element with no extracted text, crop
the entire image by the bboxes of the element
* replace empty text element with the text obtained from OCR the cropped
image
* return all merged PageLayouts and form a DocumentLayout subject for
later on process
This PR also bumps `unstructured-inference==0.7.2` since the branch relies
on the OCR refactor from unstructured-inference.
## Test
```
from unstructured.partition.auto import partition
entrie_page_ocr_mode_elements = partition(filename="example-docs/english-and-korean.png", ocr_mode="entire_page", ocr_languages="eng+kor", strategy="hi_res")
individual_blocks_ocr_mode_elements = partition(filename="example-docs/english-and-korean.png", ocr_mode="individual_blocks", ocr_languages="eng+kor", strategy="hi_res")
print([el.text for el in entrie_page_ocr_mode_elements])
print([el.text for el in individual_blocks_ocr_mode_elements])
```
latest output:
```
# entire_page
['RULES AND INSTRUCTIONS 1. Template for day 1 (korean) , for day 2 (English) for day 3 both English and korean. 2. Use all your accounts. use different emails to send. Its better to have many email', 'accounts.', 'Note: Remember to write your own "OPENING MESSAGE" before you copy and paste the template. please always include [TREASURE HARUTO] for example:', '안녕하세요, 저 희 는 YGEAS 그룹 TREASUREWH HARUTOM|2] 팬 입니다. 팬 으 로서, HARUTO 씨 받 는 대 우 에 대해 의 구 심 과 불 공 평 함 을 LRU, 이 일 을 통해 저 희 의 의 혹 을 전 달 하여 귀 사 의 진지한 민 과 적극적인 답 변 을 받을 수 있 기 를 바랍니다.', '3. CC Harutonations@gmail.com so we can keep track of how many emails were', 'successfully sent', '4. Use the hashtag of Haruto on your tweet to show that vou have sent vour email]', '메 고']
# individual_blocks
['RULES AND INSTRUCTIONS 1. Template for day 1 (korean) , for day 2 (English) for day 3 both English and korean. 2. Use all your accounts. use different emails to send. Its better to have many email', 'Note: Remember to write your own "OPENING MESSAGE" before you copy and paste the template. please always include [TREASURE HARUTO] for example:', '안녕하세요, 저 희 는 YGEAS 그룹 TREASURES HARUTOM| 2] 팬 입니다. 팬 으로서, HARUTO 씨 받 는 대 우 에 대해 의 구 심 과 habe ERO, 이 머 일 을 적극 저 희 의 ASS 전 달 하여 귀 사 의 진지한 고 2 있 기 를 바랍니다.', '3. CC Harutonations@gmail.com so we can keep track of how many emails were ciiccecefisliy cant', 'VULLESSIULY Set 4. Use the hashtag of Haruto on your tweet to show that you have sent your email']
```
---------
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: yuming-long <yuming-long@users.noreply.github.com>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
2023-10-06 18:54:49 -04:00
|
|
|
|
_, kwargs = mock_process_file_with_ocr.call_args_list[0]
|
|
|
|
|
assert "ocr_languages" in kwargs
|
|
|
|
|
assert kwargs["ocr_languages"] == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
|
2023-09-18 11:42:02 -04:00
|
|
|
|
|
|
|
|
|
|
2023-09-26 14:09:27 -04:00
|
|
|
|
def test_auto_partition_element_metadata_user_provided_languages():
    """Check that `languages=["eng"]` ends up on the first element's metadata."""
    pdf_path = "example-docs/chevron-page.pdf"
    parsed = partition(
        filename=pdf_path,
        strategy=PartitionStrategy.OCR_ONLY,
        languages=["eng"],
    )
    first_element = parsed[0]
    assert first_element.metadata.languages == ["eng"]
|
|
|
|
|
|
|
|
|
|
|
2023-11-01 17:02:00 -05:00
|
|
|
|
@pytest.mark.parametrize(
    ("languages", "ocr_languages"),
    [(["auto"], ""), (["eng"], "")],
)
def test_auto_partition_ignores_empty_string_for_ocr_languages(languages, ocr_languages):
    """Check the first element reports `["eng"]` when `ocr_languages` is an empty string."""
    text_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "book-war-and-peace-1p.txt")
    partition_kwargs = {
        "filename": text_path,
        "strategy": PartitionStrategy.OCR_ONLY,
        "ocr_languages": ocr_languages,
        "languages": languages,
    }
    parsed = partition(**partition_kwargs)
    assert parsed[0].metadata.languages == ["eng"]
|
|
|
|
|
|
|
|
|
|
|
2023-09-13 13:07:28 -04:00
|
|
|
|
def test_auto_partition_warns_with_ocr_languages(caplog):
    """Check that passing `ocr_languages` logs the deprecation notice."""
    expected_notice = "The ocr_languages kwarg will be deprecated"
    partition(
        filename="example-docs/chevron-page.pdf",
        strategy=PartitionStrategy.HI_RES,
        ocr_languages="eng",
    )
    assert expected_notice in caplog.text
|
|
|
|
|
|
|
|
|
|
|
2023-01-27 12:08:18 -05:00
|
|
|
|
def test_partition_pdf_doesnt_raise_warning():
    """Check that hi_res partitioning of the sample PDF emits no warnings."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    # NOTE(robinson): Promoting every warning to an error is the way the pytest docs
    # recommend asserting that no warning is emitted.
    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
    #      #additional-use-cases-of-warnings-in-tests
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        partition(filename=pdf_path, strategy=PartitionStrategy.HI_RES)
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
|
|
|
|
|
2023-08-02 09:22:20 -07:00
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_image(pass_metadata_filename, content_type):
    """Check the title text and coordinates of the third element of the sample JPG."""
    image_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
    parsed = partition(
        filename=image_path,
        metadata_filename=image_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.AUTO,
    )

    # should be same result as test_partition_image_default_strategy_hi_res() in test_image.py
    expected_title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    title_element = parsed[2]
    assert title_element.text == expected_title
    assert title_element.metadata.coordinates is not None
|
2023-08-02 09:22:20 -07:00
|
|
|
|
|
|
|
|
|
|
2024-01-04 09:52:00 -08:00
|
|
|
|
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_image_element_extraction(
    extract_image_block_to_payload,
    filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "embedded-images-tables.jpg"),
):
    """Partition the sample JPG with block extraction on and validate the extracted blocks."""
    block_types = ["Image", "Table"]

    # The check runs inside the `with` so the temporary output directory still exists.
    with tempfile.TemporaryDirectory() as output_dir:
        parsed = partition(
            filename=filename,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=output_dir,
        )
        assert_element_extraction(parsed, block_types, extract_image_block_to_payload, output_dir)
|
|
|
|
|
|
|
|
|
|
|
2023-03-24 16:32:45 -07:00
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpg(pass_metadata_filename, content_type):
    """Check that partitioning the sample JPG by filename yields at least one element."""
    image_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
    parsed = partition(
        filename=image_path,
        metadata_filename=image_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.AUTO,
    )
    assert len(parsed) > 0
|
|
|
|
|
|
|
|
|
|
|
2023-03-24 16:32:45 -07:00
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpg_from_file(pass_metadata_filename, content_type):
    """Check that partitioning the sample JPG from a file object yields at least one element."""
    image_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
    with open(image_path, "rb") as image_file:
        parsed = partition(
            file=image_file,
            metadata_filename=image_path if pass_metadata_filename else None,
            content_type=content_type,
            strategy=PartitionStrategy.AUTO,
        )
    assert len(parsed) > 0
|
|
|
|
|
|
|
|
|
|
|
2023-01-09 16:15:14 -05:00
|
|
|
|
def test_auto_partition_raises_with_bad_type(monkeypatch):
    """Check that `partition` raises `ValueError` when file-type detection returns None."""

    def _detect_nothing(*args, **kwargs):
        return None

    monkeypatch.setattr(auto, "detect_filetype", _detect_nothing)
    with pytest.raises(ValueError):
        partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)
|
2023-01-23 12:03:09 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Elements the PPT/PPTX filename tests below compare against the partitioned output.
EXPECTED_PPTX_OUTPUT = [
    Title(text="Adding a Bullet Slide"),
    *(
        ListItem(text=bullet_text)
        for bullet_text in (
            "Find the bullet slide layout",
            "Use _TextFrame.text for first bullet",
            "Use _TextFrame.add_paragraph() for subsequent bullets",
        )
    ),
    NarrativeText(text="Here is a lot of text!"),
    NarrativeText(text="Here is some text in a text box!"),
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_pptx_from_filename():
    """Check the elements and filename/directory metadata for the sample PPTX."""
    pptx_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
    parsed = partition(filename=pptx_path, strategy=PartitionStrategy.HI_RES)
    assert parsed == EXPECTED_PPTX_OUTPUT

    first_metadata = parsed[0].metadata
    assert first_metadata.filename == os.path.basename(pptx_path)
    assert first_metadata.file_directory == os.path.split(pptx_path)[0]
|
2023-02-08 10:11:15 -05:00
|
|
|
|
|
|
|
|
|
|
2023-03-30 16:54:29 -04:00
|
|
|
|
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_ppt_from_filename():
    """Check the elements and filename/directory metadata for the sample legacy PPT."""
    ppt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
    parsed = partition(filename=ppt_path, strategy=PartitionStrategy.HI_RES)
    assert parsed == EXPECTED_PPTX_OUTPUT

    first_metadata = parsed[0].metadata
    assert first_metadata.filename == os.path.basename(ppt_path)
    assert first_metadata.file_directory == os.path.split(ppt_path)[0]
|
2023-02-17 11:57:08 -05:00
|
|
|
|
|
|
|
|
|
|
2023-02-08 10:11:15 -05:00
|
|
|
|
def test_auto_with_page_breaks():
    """Check that `include_page_breaks=True` yields a "PageBreak" element for the sample PDF."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    parsed = partition(
        filename=pdf_path,
        include_page_breaks=True,
        strategy=PartitionStrategy.HI_RES,
    )
    categories = [item.category for item in parsed]
    assert "PageBreak" in categories
|
2023-03-14 11:52:21 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_epub_from_filename():
    """Check the opening text when partitioning the sample EPUB by filename."""
    epub_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
    parsed = partition(filename=epub_path, strategy=PartitionStrategy.HI_RES)
    assert len(parsed) > 0
    opening_text = parsed[0].text
    assert opening_text.startswith("The Project Gutenberg eBook of Winter Sports")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_epub_from_file():
    """Check the opening text when partitioning the sample EPUB from a file object."""
    epub_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
    with open(epub_path, "rb") as epub_file:
        parsed = partition(file=epub_file, strategy=PartitionStrategy.HI_RES)
    assert len(parsed) > 0
    opening_text = parsed[0].text
    assert opening_text.startswith("The Project Gutenberg eBook of Winter Sports")
|
2023-03-28 16:15:22 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Elements the MSG filename test below compares against the partitioned output.
EXPECTED_MSG_OUTPUT = [
    NarrativeText(text="This is a test email to use for unit tests."),
    Title(text="Important points:"),
    *(ListItem(text=item_text) for item_text in ("Roses are red", "Violets are blue")),
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_msg_from_filename():
    """Check that the sample MSG partitions to `EXPECTED_MSG_OUTPUT`."""
    msg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
    parsed = partition(filename=msg_path, strategy=PartitionStrategy.HI_RES)
    assert parsed == EXPECTED_MSG_OUTPUT
|
2023-04-10 17:25:03 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_rtf_from_filename():
    """Check that the sample RTF starts with the expected title element."""
    rtf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf")
    parsed = partition(filename=rtf_path, strategy=PartitionStrategy.HI_RES)
    first_element = parsed[0]
    assert first_element == Title("My First Heading")
|
2023-04-12 14:31:01 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_from_url():
    """Check the first element and its `url` metadata with an explicit text/plain content type."""
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
    parsed = partition(
        url=license_url,
        content_type="text/plain",
        strategy=PartitionStrategy.HI_RES,
    )
    first_element = parsed[0]
    assert first_element == Title("Apache License")
    assert first_element.metadata.url == license_url
|
2023-04-13 11:04:15 -04:00
|
|
|
|
|
|
|
|
|
|
fix: parse URL response Content-Type according to RFC 9110 (#2950)
Currently, `file_and_type_from_url()` does not correctly handle the
`Content-Type` header. Specifically, it assumes that the header contains
only the mime-type (e.g. `text/html`), however, [RFC
9110](https://www.rfc-editor.org/rfc/rfc9110#field.content-type) allows
for additional directives — specifically the `charset` — to be returned
in the header. This leads to a `ValueError` when loading a URL with a
response Content-Type header such as `text/html; charset=UTF-8`.
To reproduce the issue:
```python
from unstructured.partition.auto import partition
url = "https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/"
partition(url=url)
```
Which will result in the following exception:
```python
{
"name": "ValueError",
"message": "Invalid file. The FileType.UNK file type is not supported in partition.",
"stack": "---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[1], line 4
1 from unstructured.partition.auto import partition
3 url = \"https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/\"
----> 4 partition(url=url)
File ~/miniconda3/envs/ai-tasks/lib/python3.11/site-packages/unstructured/partition/auto.py:541, in partition(filename, content_type, file, file_filename, url, include_page_breaks, strategy, encoding, paragraph_grouper, headers, skip_infer_table_types, ssl_verify, ocr_languages, languages, detect_language_per_element, pdf_infer_table_structure, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, xml_keep_tags, data_source_metadata, metadata_filename, request_timeout, hi_res_model_name, model_name, date_from_file_object, starting_page_number, **kwargs)
539 else:
540 msg = \"Invalid file\" if not filename else f\"Invalid file {filename}\"
--> 541 raise ValueError(f\"{msg}. The {filetype} file type is not supported in partition.\")
543 for element in elements:
544 element.metadata.url = url
ValueError: Invalid file. The FileType.UNK file type is not supported in partition."
}
```
This PR fixes the issue by parsing the mime-type out of the
`Content-Type` header string.
Closes #2257
2024-04-30 07:53:44 +02:00
|
|
|
|
def test_auto_partition_from_url_with_rfc9110_content_type():
    """Check the first element and its `url` metadata when the content type has a charset."""
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
    parsed = partition(
        url=license_url,
        content_type="text/plain; charset=utf-8",
        strategy=PartitionStrategy.HI_RES,
    )
    first_element = parsed[0]
    assert first_element == Title("Apache License")
    assert first_element.metadata.url == license_url
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_from_url_without_providing_content_type():
    """Check the first element and its `url` metadata when no content type is passed."""
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
    parsed = partition(url=license_url, strategy=PartitionStrategy.HI_RES)
    first_element = parsed[0]
    assert first_element == Title("Apache License")
    assert first_element.metadata.url == license_url
|
|
|
|
|
|
|
|
|
|
|
2023-04-13 12:47:45 -04:00
|
|
|
|
def test_partition_md_works_with_embedded_html():
    """Check that some element of the project README mentions "unstructured".

    The README is fetched by URL and partitioned as markdown. The previous version
    looped over every element but tested `elements[0].text` on each iteration, so
    only the first element was ever inspected.
    """
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"
    elements = partition(url=url, content_type="text/markdown", strategy=PartitionStrategy.HI_RES)
    assert any("unstructured" in element.text for element in elements)
|
|
|
|
|
|
|
|
|
|
|
2023-04-13 11:04:15 -04:00
|
|
|
|
def test_auto_partition_warns_if_header_set_and_not_url(caplog):
    """Check that passing `headers` with a filename logs a WARNING as the first record."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
    partition(
        filename=eml_path,
        headers={"Accept": "application/pdf"},
        strategy=PartitionStrategy.HI_RES,
    )
    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"
|
2023-04-26 13:52:47 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_works_with_unstructured_jsons():
    """Check the first element's text for the serialized-elements JSON, read by filename."""
    json_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
    parsed = partition(filename=json_path, strategy=PartitionStrategy.HI_RES)
    first_element = parsed[0]
    assert first_element.text == "News Around NOAA"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_works_with_unstructured_jsons_from_file():
    """Check the first element's text for the serialized-elements JSON, read from a file object."""
    json_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
    with open(json_path, "rb") as json_file:
        parsed = partition(file=json_file, strategy=PartitionStrategy.HI_RES)
    first_element = parsed[0]
    assert first_element.text == "News Around NOAA"
|
2023-05-04 15:28:08 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_odt_from_filename():
    """Check that the sample ODT, read by filename, starts with the expected title element."""
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    parsed = partition(filename=odt_path, strategy=PartitionStrategy.HI_RES)
    first_element = parsed[0]
    assert first_element == Title("Lorem ipsum dolor sit amet.")
|
2023-05-04 15:28:08 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_odt_from_file():
    """Check that the sample ODT, read from a file object, starts with the expected title."""
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    with open(odt_path, "rb") as odt_file:
        parsed = partition(file=odt_file, strategy=PartitionStrategy.HI_RES)

    first_element = parsed[0]
    assert first_element == Title("Lorem ipsum dolor sit amet.")
|
2023-05-15 13:23:19 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("content_type", "routing_func", "expected"),
    [
        ("text/csv", "csv", "text/csv"),
        ("text/html", "html", "text/html"),
        ("jdsfjdfsjkds", "pdf", None),
    ],
)
def test_auto_adds_filetype_to_metadata(content_type, routing_func, expected, monkeypatch):
    """Check `metadata.filetype` on elements returned through a stubbed partitioner.

    The routed `partition_<routing_func>` is replaced by a stub returning two bare
    `Text` elements, so any filetype on the result comes from `partition` itself.
    """

    def _fake_partitioner(*args, **kwargs):
        return [Text("text 1"), Text("text 2")]

    target = f"unstructured.partition.auto.partition_{routing_func}"
    with patch(target, _fake_partitioner) as patched:
        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {routing_func: patched})
        parsed = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type)

    assert len(parsed) == 2
    assert all(item.metadata.filetype == expected for item in parsed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("content_type", "expected"),
    [
        ("application/pdf", FILETYPE_TO_MIMETYPE[FileType.PDF]),
        (None, FILETYPE_TO_MIMETYPE[FileType.PDF]),
    ],
)
def test_auto_filetype_overrides_file_specific(content_type, expected, monkeypatch):
    """Check that `partition` replaces the filetype set by the stubbed PDF partitioner.

    The stub tags both elements with the bogus filetype "imapdf"; the elements
    returned by `partition` are expected to carry the PDF MIME type instead.
    """
    stub_metadata = ElementMetadata(filetype="imapdf")

    def _fake_partition_pdf(*args, **kwargs):
        return [
            Text("text 1", metadata=stub_metadata),
            Text("text 2", metadata=stub_metadata),
        ]

    with patch("unstructured.partition.auto.partition_pdf", _fake_partition_pdf) as patched:
        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {"pdf": patched})
        parsed = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type)

    assert len(parsed) == 2
    assert all(item.metadata.filetype == expected for item in parsed)
|
|
|
|
|
|
|
|
|
|
|
2024-01-04 09:52:00 -08:00
|
|
|
|
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_pdf_element_extraction(
    extract_image_block_to_payload,
    filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "embedded-images-tables.pdf"),
):
    """Partition the sample PDF with block extraction on and validate the extracted blocks."""
    block_types = ["Image", "Table"]

    # The check runs inside the `with` so the temporary output directory still exists.
    with tempfile.TemporaryDirectory() as output_dir:
        parsed = partition(
            filename=filename,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=output_dir,
        )
        assert_element_extraction(parsed, block_types, extract_image_block_to_payload, output_dir)
|
|
|
|
|
|
|
|
|
|
|
2023-05-15 13:23:19 -05:00
|
|
|
|
# Every FileType member except UNK, ZIP and XLS; used to parametrize the test below.
supported_filetypes = [
    filetype
    for filetype in FileType
    if filetype not in (FileType.UNK, FileType.ZIP, FileType.XLS)
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# File types whose partitioner module name differs from the lowercased enum member name.
FILETYPE_TO_MODULE = {
    **dict.fromkeys((FileType.JPG, FileType.PNG, FileType.HEIC), "image"),
    FileType.TXT: "text",
    FileType.EML: "email",
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("filetype", supported_filetypes)
def test_file_specific_produces_correct_filetype(filetype: FileType):
    """Check that a file-specific partitioner stamps the expected MIME type on its elements.

    The partitioner is looked up as `unstructured.partition.<module>.partition_<module>`,
    where `<module>` comes from `FILETYPE_TO_MODULE` or defaults to the lowercased
    enum member name. Only the first matching file in `example-docs` is partitioned.
    Elements whose `metadata.filetype` is None are not checked.
    """
    if filetype in auto.IMAGE_FILETYPES or filetype in (FileType.WAV, FileType.EMPTY):
        pytest.skip()
    extension = filetype.name.lower()
    filetype_module = FILETYPE_TO_MODULE.get(filetype, extension)
    fun_name = "partition_" + filetype_module
    module = import_module(f"unstructured.partition.{filetype_module}")
    # Plain attribute lookup; `getattr` replaces the former `eval` of a built string.
    fun = getattr(module, fun_name)
    # NOTE(review): if no example file has this extension the loop body never runs
    # and the test passes without asserting anything.
    for file in pathlib.Path("example-docs").iterdir():
        if file.is_file() and file.suffix == f".{extension}":
            elements = fun(str(file))
            assert all(
                el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype]
                for el in elements
                if el.metadata.filetype is not None
            )
            break
|
2023-05-16 15:40:40 -04:00
|
|
|
|
|
|
|
|
|
|
2023-05-18 11:40:12 -04:00
|
|
|
|
def test_auto_partition_xml_from_filename(filename="example-docs/factbook.xml"):
    """Check the first element's text and filename metadata with `xml_keep_tags=False`."""
    parsed = partition(filename=filename, xml_keep_tags=False, metadata_filename=filename)

    first_element = parsed[0]
    assert first_element.text == "United States"
    assert first_element.metadata.filename == "factbook.xml"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_xml_from_file(filename="example-docs/factbook.xml"):
    """Check the first element's text when partitioning a file object, `xml_keep_tags=False`."""
    with open(filename, "rb") as xml_file:
        parsed = partition(file=xml_file, xml_keep_tags=False)

    first_element = parsed[0]
    assert first_element.text == "United States"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"):
    """Check that `xml_keep_tags=True` keeps raw XML tags in the first element's text."""
    parsed = partition(filename=filename, xml_keep_tags=True)

    first_element = parsed[0]
    assert "<leader>Joe Biden</leader>" in first_element.text
    assert first_element.metadata.filename == "factbook.xml"
|
2023-05-18 11:40:12 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"):
|
|
|
|
|
with open(filename, "rb") as f:
|
|
|
|
|
elements = partition(file=f, xml_keep_tags=True)
|
|
|
|
|
|
feat: `partition_xml` infers element type on each leaf node (#1249)
### Summary
Closes #1229. Updates `partition_xml` so that the element type is
inferred on each leaf node when `xml_keep_tags=False` instead of
delegating splitting and partitioning to `partition_xml`. If
`xml_keep_tags=True`, the file is treated like a text file still and
partitioning is still delegated to `partition_text`.
Also adds the option to pass `text` as an input to `partition_xml`.
### Testing
Create a `parrots.xml` file that looks like:
```xml
<xml><parrot><name>Conure</name><description>A conure is a very friendly bird.
Conures are feathery and like to dance.</description></parrot></xml>
```
Run:
```python
from unstructured.partition.xml import partition_xml
from unstructured.staging.base import convert_to_dict
elements = partition_xml(filename="parrots.xml")
convert_to_dict(elements)
```
On `main`, the output is the following. Notice how the `<name>` tag
incorrectly gets merged into `<description>` in the first element.
```python
[{'element_id': '7ae4074435df8dfcefcf24a4e6c52026',
'metadata': {'file_directory': '/home/matt/tmp',
'filename': 'parrots.xml',
'filetype': 'application/xml',
'last_modified': '2023-08-30T14:21:38'},
'text': 'Conure A conure is a very friendly bird.',
'type': 'NarrativeText'},
{'element_id': '859ecb332da6961acd2fb6a0185d1549',
'metadata': {'file_directory': '/home/matt/tmp',
'filename': 'parrots.xml',
'filetype': 'application/xml',
'last_modified': '2023-08-30T14:21:38'},
'text': 'Conures are feathery and like to dance.',
'type': 'NarrativeText'}]
```
On the feature branch, the output is the following, and the tags are
correctly separated.
```python
[{'element_id': '5512218914e4eeacf71a9cd42c373710',
'metadata': {'file_directory': '/home/matt/tmp',
'filename': 'parrots.xml',
'filetype': 'application/xml',
'last_modified': '2023-08-30T14:21:38'},
'text': 'Conure',
'type': 'Title'},
{'element_id': '113bf8d250c2b1a77c9c2caa4b812f85',
'metadata': {'file_directory': '/home/matt/tmp',
'filename': 'parrots.xml',
'filetype': 'application/xml',
'last_modified': '2023-08-30T14:21:38'},
'text': 'A conure is a very friendly bird.\n'
'\n'
'Conures are feathery and like to dance.',
'type': 'NarrativeText'}]
```
2023-08-30 17:07:10 -04:00
|
|
|
|
assert "<leader>Joe Biden</leader>" in elements[0].text
|
2023-05-18 11:40:12 -04:00
|
|
|
|
|
|
|
|
|
|
2023-05-16 15:40:40 -04:00
|
|
|
|
EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
|
2023-10-23 17:11:53 -07:00
|
|
|
|
elements = partition(filename=filename, include_header=False, skip_infer_table_types=[])
|
2023-05-16 15:40:40 -04:00
|
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
|
|
|
|
assert sum(isinstance(element, Title) for element in elements) == 2
|
|
|
|
|
assert len(elements) == 4
|
2023-05-16 15:40:40 -04:00
|
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
|
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
|
fix: stop csv and tsv dropping the first line of the file (#1530)
The current code assumes the first line of csv and tsv files are a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.
Here is a snippet of code that demonstrates the current behavior and the
proposed fix
```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
c1 = """
Stanley Cups,,
Team,Location,Stanley Cups
Blues,STL,1
Flyers,PHI,2
Maple Leafs,TOR,13
"""
f = "./test.csv"
with open(f, 'w') as ff:
ff.write(c1)
print("Suggested Improvement Keep First Line")
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
print("\n\nOriginal Looses First Line")
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```
---------
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
2023-10-17 00:59:35 +02:00
|
|
|
|
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT_XLSX
|
|
|
|
|
assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
|
assert elements[1].metadata.page_number == 1
|
|
|
|
|
assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE
|
2023-05-16 15:40:40 -04:00
|
|
|
|
|
|
|
|
|
|
2023-10-23 17:11:53 -07:00
|
|
|
|
@pytest.mark.parametrize(
|
|
|
|
|
("skip_infer_table_types", "filename", "has_text_as_html_field"),
|
|
|
|
|
[
|
|
|
|
|
(["xlsx"], "stanley-cups.xlsx", False),
|
|
|
|
|
([], "stanley-cups.xlsx", True),
|
|
|
|
|
(["odt"], "fake.odt", False),
|
|
|
|
|
([], "fake.odt", True),
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
def test_auto_partition_respects_skip_infer_table_types(
|
2023-10-24 13:13:28 -04:00
|
|
|
|
skip_infer_table_types,
|
|
|
|
|
filename,
|
|
|
|
|
has_text_as_html_field,
|
2023-10-23 17:11:53 -07:00
|
|
|
|
):
|
|
|
|
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
|
|
|
|
|
with open(filename, "rb") as f:
|
|
|
|
|
table_elements = [
|
|
|
|
|
e
|
|
|
|
|
for e in partition(file=f, skip_infer_table_types=skip_infer_table_types)
|
|
|
|
|
if isinstance(e, Table)
|
|
|
|
|
]
|
|
|
|
|
for table_element in table_elements:
|
|
|
|
|
table_element_has_text_as_html_field = (
|
|
|
|
|
hasattr(table_element.metadata, "text_as_html")
|
|
|
|
|
and table_element.metadata.text_as_html is not None
|
|
|
|
|
)
|
|
|
|
|
assert table_element_has_text_as_html_field == has_text_as_html_field
|
|
|
|
|
|
|
|
|
|
|
2023-05-16 15:40:40 -04:00
|
|
|
|
def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
    """Partition an XLSX workbook from an open file handle and check the result.

    The file is partitioned with `include_header=False` and an empty
    `skip_infer_table_types`; the test then checks the element counts by type and
    the text and metadata of the first two elements against the module's expected
    constants.
    """
    with open(filename, "rb") as xlsx_file:
        elements = partition(file=xlsx_file, include_header=False, skip_infer_table_types=[])

    tables = [element for element in elements if isinstance(element, Table)]
    titles = [element for element in elements if isinstance(element, Title)]
    assert len(tables) == 2
    assert len(titles) == 2
    assert len(elements) == 4

    first, second = elements[0], elements[1]
    assert clean_extra_whitespace(first.text) == EXPECTED_TITLE
    assert clean_extra_whitespace(second.text) == EXPECTED_TEXT_XLSX

    second_metadata = second.metadata
    assert second_metadata.text_as_html == EXPECTED_TABLE_XLSX
    assert second_metadata.page_number == 1
    assert second_metadata.filetype == EXPECTED_XLSX_FILETYPE
|
2023-05-19 15:57:42 -04:00
|
|
|
|
|
|
|
|
|
|
2024-04-15 23:03:42 +02:00
|
|
|
|
def test_auto_partition_respects_starting_page_number_argument_for_xlsx():
    """Partitioning an XLSX file with `starting_page_number=3` numbers pages from 3."""
    parsed = partition("example-docs/stanley-cups.xlsx", starting_page_number=3)
    second_element = parsed[1]
    # The element at index 1 must carry the requested starting page number.
    assert second_element.metadata.page_number == 3
|
|
|
|
|
|
|
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We count the first non-single empty rows from the top
and bottom to determine that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
|
# Expected text length for the XLS fixture.
# NOTE(review): presumably compared against the length of the partitioned
# text of example-docs/tests-example.xls; the consuming test is outside this
# view, so confirm which string is measured.
EXPECTED_XLS_TEXT_LEN = 550
|
2023-08-10 13:57:46 -07:00
|
|
|
|
|
2023-05-26 01:55:32 -07:00
|
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We count the first non-single empty rows from the top
and bottom to determine that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
|
# A 45-character string: the expected leading text of the XLS fixture.
# NOTE(review): presumably matched against the first 45 characters of the
# cleaned element text; the consuming test is outside this view, so confirm.
EXPECTED_XLS_INITIAL_45_CLEAN_TEXT = "MC What is 2+2? 4 correct 3 incorrect MA What"
|
2023-05-26 01:55:32 -07:00
|
|
|
|
|
|
|
|
|
EXPECTED_XLS_TABLE = (
|
|
|
|
|
"""<table border="1" class="dataframe">
|
|
|
|
|
<tbody>
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We count the first non-single empty rows from the top
and bottom to determine that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
|
<tr>
|
|
|
|
|
<td>MC</td>
|
|
|
|
|
<td>What is 2+2?</td>
|
|
|
|
|
<td>4</td>
|
|
|
|
|
<td>correct</td>
|
|
|
|
|
<td>3</td>
|
|
|
|
|
<td>incorrect</td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td></td>
|
|
|
|
|
</tr>
|
2023-05-26 01:55:32 -07:00
|
|
|
|
<tr>
|
|
|
|
|
<td>MA</td>
|
|
|
|
|
<td>What C datatypes are 8 bits? (assume i386)</td>
|
|
|
|
|
<td>int</td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td>float</td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td>double</td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td>char</td>
|
|
|
|
|
</tr>
|
|
|
|
|
<tr>
|
|
|
|
|
<td>TF</td>
|
|
|
|
|
<td>Bagpipes are awesome.</td>
|
|
|
|
|
<td>true</td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td></td>
|
|
|
|
|
</tr>
|
|
|
|
|
<tr>
|
|
|
|
|
<td>ESS</td>
|
|
|
|
|
<td>How have the original Henry Hornbostel buildings """
|
|
|
|
|
"""influenced campus architecture and design in the last 30 years?</td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td></td>
|
|
|
|
|
</tr>
|
|
|
|
|
<tr>
|
|
|
|
|
<td>ORD</td>
|
|
|
|
|
<td>Rank the following in their order of operation.</td>
|
|
|
|
|
<td>Parentheses</td>
|
|
|
|
|
<td>Exponents</td>
|
|
|
|
|
<td>Division</td>
|
|
|
|
|
<td>Addition</td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td></td>
|
|
|
|
|
</tr>
|
|
|
|
|
<tr>
|
|
|
|
|
<td>FIB</td>
|
|
|
|
|
<td>The student activities fee is</td>
|
|
|
|
|
<td>95</td>
|
|
|
|
|
<td>dollars for students enrolled in</td>
|
|
|
|
|
<td>19</td>
|
|
|
|
|
<td>units or more,</td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td></td>
|
|
|
|
|
<td></td>
|
|
|
|
|
</tr>
|
|
|
|
|
<tr>
|
|
|
|
|
<td>MAT</td>
|
|
|
|
|
<td>Match the lower-case greek letter with its capital form.</td>
|
|
|
|
|
<td>λ</td>
|
|
|
|
|
<td>Λ</td>
|
|
|
|
|
<td>α</td>
|
|
|
|
|
<td>γ</td>
|
|
|
|
|
<td>Γ</td>
|
|
|
|
|
<td>φ</td>
|
|
|
|
|
<td>Φ</td>
|
|
|
|
|
</tr>
|
|
|
|
|
</tbody>
|
|
|
|
|
</table>"""
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
2023-08-04 16:14:08 +02:00
|
|
|
|
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
2023-05-26 01:55:32 -07:00
|
|
|
|
def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"):
|
2023-10-23 17:11:53 -07:00
|
|
|
|
elements = partition(filename=filename, include_header=False, skip_infer_table_types=[])
|
2023-05-26 01:55:32 -07:00
|
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages a connected-components search to find subtables within the
sheet. This search is a depth-first search (DFS)
- It also handles overlapping table and text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to have its element type determined as text
- A connected component may include a table title, header, or
footer. We count the single-cell rows from the top and bottom, up to the
first row that is not single-cell, to identify that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
fix(xlsx): xlsx subtable algorithm (#2534)
**Reviewers:** It may be easier to review each of the two commits
separately. The first adds the new `_SubtableParser` object with its
unit-tests and the second one uses that object to replace the flawed
existing subtable-parsing algorithm.
**Summary**
There is a cluster of bugs in `partition_xlsx()` that all derive from
flaws in the algorithm we use to detect "subtables". These are
encountered when the user wants to get multiple document-elements from
each worksheet, which is the default (argument `find_subtable = True`).
This PR replaces the flawed existing algorithm with a `_SubtableParser`
object that encapsulates all that logic and has thorough unit-tests.
**Additional Context**
This is a summary of the failure cases. There are a few other cases but
they're closely related and this was enough evidence and scope for my
purposes. This PR fixes all these bugs:
```python
#
# -- ✅ CASE 1: There are no leading or trailing single-cell rows.
# -> the subtable functions never get called; the subtable is emitted as the only element
#
# a b -> Table(a, b, c, d)
# c d
# -- ✅ CASE 2: There is exactly one leading single-cell row.
# -> Leading single-cell row emitted as `Title` element, core-table properly identified.
#
# a -> [ Title(a),
# b c Table(b, c, d, e) ]
# d e
# -- ❌ CASE 3: There are two-or-more leading single-cell rows.
# -> leading single-cell rows are included in subtable
#
# a -> [ Table(a, b, c, d, e, f) ]
# b
# c d
# e f
# -- ❌ CASE 4: There is exactly one trailing single-cell row.
# -> core table is dropped. trailing single-cell row is emitted as Title
# (this is the behavior in the reported bug)
#
# a b -> [ Title(e) ]
# c d
# e
# -- ❌ CASE 5: There are two-or-more trailing single-cell rows.
# -> core table is dropped. trailing single-cell rows are each emitted as a Title
#
# a b -> [ Title(e),
# c d Title(f) ]
# e
# f
# -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows.
# -> core table is correctly identified, leading and trailing single-cell rows are each
# emitted as a Title.
#
# a -> [ Title(a),
# b c Table(b, c, d, e),
# d e Title(f) ]
# f
# -- ✅ CASE 7: There are two leading and one trailing single-cell rows.
# -> core table is correctly identified, leading and trailing single-cell rows are each
# emitted as a Title.
#
# a -> [ Title(a),
# b Title(b),
# c d Table(c, d, e, f),
# e f Title(g) ]
# g
# -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows.
# -> core table is correctly identified, leading and trailing single-cell rows are each
# emitted as a Title.
#
# a -> [ Title(a),
# b Title(b),
# c d Table(c, d, e, f),
# e f Title(g),
# g Title(h) ]
# h
# -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below.
# -> First cell is mistakenly emitted as title, remaining cells are dropped.
#
# a b c -> [ Title(a) ]
# -- ❌ CASE 10: Single-row subtable with one leading single-cell row.
# -> Leading single-cell row is correctly identified as title, core-table is mis-identified
# as a `Title` and truncated.
#
# a -> [ Title(a),
# b c d Title(b) ]
```
2024-02-13 20:29:17 -08:00
|
|
|
|
assert len(elements) == 14
|
2023-05-26 01:55:32 -07:00
|
|
|
|
|
|
|
|
|
assert clean_extra_whitespace(elements[0].text)[:45] == EXPECTED_XLS_INITIAL_45_CLEAN_TEXT
|
2023-08-10 13:57:46 -07:00
|
|
|
|
# NOTE(crag): if the beautifulsoup4 package is installed, some (but not all) additional
|
|
|
|
|
# whitespace is removed, so the expected text length is less than is the case
|
|
|
|
|
# when beautifulsoup4 is *not* installed. E.g.
|
|
|
|
|
# "\n\n\nMA\nWhat C datatypes are 8 bits" vs.
|
|
|
|
|
# "\n \n \n MA\n What C datatypes are 8 bits?..."
|
2023-05-26 01:55:32 -07:00
|
|
|
|
assert len(elements[0].text) == EXPECTED_XLS_TEXT_LEN
|
|
|
|
|
assert elements[0].metadata.text_as_html == EXPECTED_XLS_TABLE
|
|
|
|
|
|
|
|
|
|
|
2023-05-19 17:40:26 -04:00
|
|
|
|
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
2023-05-19 15:57:42 -04:00
|
|
|
|
def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
|
|
|
|
|
elements = partition(filename=filename)
|
|
|
|
|
|
2023-06-15 13:50:53 -05:00
|
|
|
|
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
|
|
|
|
|
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
|
2023-05-19 15:57:42 -04:00
|
|
|
|
assert elements[0].metadata.filetype == "text/csv"
|
|
|
|
|
|
|
|
|
|
|
2023-07-27 13:33:36 -04:00
|
|
|
|
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
|
|
|
|
def test_auto_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"):
|
|
|
|
|
elements = partition(filename=filename)
|
|
|
|
|
|
|
|
|
|
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
|
|
|
|
|
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
|
|
|
|
|
assert elements[0].metadata.filetype == "text/tsv"
|
|
|
|
|
|
|
|
|
|
|
2023-05-19 17:40:26 -04:00
|
|
|
|
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
2023-05-19 15:57:42 -04:00
|
|
|
|
def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
|
|
|
|
|
with open(filename, "rb") as f:
|
|
|
|
|
elements = partition(file=f)
|
|
|
|
|
|
2023-06-15 13:50:53 -05:00
|
|
|
|
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
|
2023-05-19 15:57:42 -04:00
|
|
|
|
assert isinstance(elements[0], Table)
|
2023-06-15 13:50:53 -05:00
|
|
|
|
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
|
2023-05-19 15:57:42 -04:00
|
|
|
|
assert elements[0].metadata.filetype == "text/csv"
|
2023-06-09 16:07:50 -04:00
|
|
|
|
|
|
|
|
|
|
2023-06-27 21:52:39 +03:00
|
|
|
|
def test_auto_partition_html_pre_from_file(filename="example-docs/fake-html-pre.htm"):
|
|
|
|
|
elements = partition(filename=filename)
|
|
|
|
|
|
|
|
|
|
assert len(elements) > 0
|
2023-06-28 23:14:05 -04:00
|
|
|
|
assert "PageBreak" not in [elem.category for elem in elements]
|
2023-08-25 00:14:48 -04:00
|
|
|
|
assert clean_extra_whitespace(elements[0].text).startswith("[107th Congress Public Law 56]")
|
|
|
|
|
assert isinstance(elements[0], NarrativeText)
|
2023-06-27 21:52:39 +03:00
|
|
|
|
assert elements[0].metadata.filetype == "text/html"
|
|
|
|
|
assert elements[0].metadata.filename == "fake-html-pre.htm"
|
|
|
|
|
|
|
|
|
|
|
2023-06-09 16:07:50 -04:00
|
|
|
|
def test_auto_partition_works_on_empty_filename(filename="example-docs/empty.txt"):
|
|
|
|
|
assert partition(filename=filename) == []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_works_on_empty_file(filename="example-docs/empty.txt"):
|
|
|
|
|
with open(filename, "rb") as f:
|
|
|
|
|
assert partition(file=f) == []
|
2023-06-12 15:31:10 -04:00
|
|
|
|
|
|
|
|
|
|
2023-06-23 20:45:31 +02:00
|
|
|
|
def test_auto_partition_org_from_filename(filename="example-docs/README.org"):
|
|
|
|
|
elements = partition(filename=filename)
|
|
|
|
|
|
|
|
|
|
assert elements[0] == Title("Example Docs")
|
|
|
|
|
assert elements[0].metadata.filetype == "text/org"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_org_from_file(filename="example-docs/README.org"):
|
|
|
|
|
with open(filename, "rb") as f:
|
|
|
|
|
elements = partition(file=f, content_type="text/org")
|
|
|
|
|
|
|
|
|
|
assert elements[0] == Title("Example Docs")
|
|
|
|
|
assert elements[0].metadata.filetype == "text/org"
|
|
|
|
|
|
|
|
|
|
|
2023-06-12 15:31:10 -04:00
|
|
|
|
def test_auto_partition_rst_from_filename(filename="example-docs/README.rst"):
|
|
|
|
|
elements = partition(filename=filename)
|
|
|
|
|
|
|
|
|
|
assert elements[0] == Title("Example Docs")
|
|
|
|
|
assert elements[0].metadata.filetype == "text/x-rst"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_rst_from_file(filename="example-docs/README.rst"):
|
|
|
|
|
with open(filename, "rb") as f:
|
|
|
|
|
elements = partition(file=f, content_type="text/x-rst")
|
|
|
|
|
|
|
|
|
|
assert elements[0] == Title("Example Docs")
|
|
|
|
|
assert elements[0].metadata.filetype == "text/x-rst"
|
2023-08-02 18:14:15 -07:00
|
|
|
|
|
|
|
|
|
|
2023-08-24 03:02:47 -04:00
|
|
|
|
def test_auto_partition_metadata_filename():
|
|
|
|
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
|
|
|
|
|
with open(filename) as f:
|
|
|
|
|
elements = partition(file=f, metadata_filename=filename)
|
|
|
|
|
assert elements[0].metadata.filename == os.path.split(filename)[-1]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_warns_about_file_filename_deprecation(caplog):
|
2023-08-02 18:14:15 -07:00
|
|
|
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
|
|
|
|
|
with open(filename) as f:
|
|
|
|
|
elements = partition(file=f, file_filename=filename)
|
|
|
|
|
assert elements[0].metadata.filename == os.path.split(filename)[-1]
|
2023-08-24 03:02:47 -04:00
|
|
|
|
assert "WARNING" in caplog.text
|
|
|
|
|
assert "The file_filename kwarg will be deprecated" in caplog.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_raises_with_file_and_metadata_filename():
|
|
|
|
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
|
|
|
|
|
with open(filename) as f, pytest.raises(ValueError):
|
|
|
|
|
partition(file=f, file_filename=filename, metadata_filename=filename)
|
2023-08-21 23:00:21 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_get_partition_with_extras_prompts_for_install_if_missing():
|
|
|
|
|
partition_with_extras_map = {}
|
|
|
|
|
with pytest.raises(ImportError) as exception_info:
|
|
|
|
|
_get_partition_with_extras("pdf", partition_with_extras_map)
|
|
|
|
|
|
|
|
|
|
msg = str(exception_info.value)
|
|
|
|
|
assert 'Install the pdf dependencies with pip install "unstructured[pdf]"' in msg
|
2023-09-11 16:00:14 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_add_chunking_strategy_on_partition_auto():
|
|
|
|
|
filename = "example-docs/example-10k-1p.html"
|
|
|
|
|
elements = partition(filename)
|
2023-10-03 09:40:34 -07:00
|
|
|
|
chunk_elements = partition(filename, chunking_strategy="by_title")
|
2023-09-11 16:00:14 -05:00
|
|
|
|
chunks = chunk_by_title(elements)
|
|
|
|
|
assert chunk_elements != elements
|
|
|
|
|
assert chunk_elements == chunks
|
|
|
|
|
|
|
|
|
|
|
2023-10-03 09:40:34 -07:00
|
|
|
|
def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
|
|
|
|
|
filename = "example-docs/example-10k-1p.html"
|
|
|
|
|
|
|
|
|
|
# default chunk size in chars is 200
|
|
|
|
|
partitioned_table_elements_200_chars = [
|
|
|
|
|
e
|
|
|
|
|
for e in partition(
|
|
|
|
|
filename,
|
|
|
|
|
chunking_strategy="by_title",
|
|
|
|
|
max_characters=200,
|
|
|
|
|
combine_text_under_n_chars=5,
|
|
|
|
|
)
|
|
|
|
|
if isinstance(e, (Table, TableChunk))
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
partitioned_table_elements_5_chars = [
|
|
|
|
|
e
|
|
|
|
|
for e in partition(
|
|
|
|
|
filename,
|
|
|
|
|
chunking_strategy="by_title",
|
|
|
|
|
max_characters=5,
|
|
|
|
|
combine_text_under_n_chars=5,
|
|
|
|
|
)
|
|
|
|
|
if isinstance(e, (Table, TableChunk))
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
elements = partition(filename)
|
|
|
|
|
|
|
|
|
|
table_elements = [e for e in elements if isinstance(e, Table)]
|
|
|
|
|
|
|
|
|
|
assert len(partitioned_table_elements_5_chars) != len(table_elements)
|
|
|
|
|
assert len(partitioned_table_elements_200_chars) != len(table_elements)
|
|
|
|
|
|
2024-01-10 14:19:24 -08:00
|
|
|
|
# trailing whitespace is stripped from the first chunk, leaving only a checkbox character
|
|
|
|
|
assert len(partitioned_table_elements_5_chars[0].text) == 1
|
|
|
|
|
# but the second chunk is the full 5 characters
|
rfctr(chunking): split oversized chunks on word boundary (#2297)
The text of an oversized chunk is split on an arbitrary character
boundary (mid-word). The `chunk_by_character()` strategy introduces the
idea of allowing the user to specify a separator to use for
chunk-splitting. For `langchain` this is typically "\n\n", "\n", or " ";
blank-line, newline, or word boundaries respectively.
Even if the user is allowed to specify a separator, we must provide
fall-back for when a chunk contains no such character. This can be done
incrementally, like blank-line is preferable to newline, newline is
preferable to word, and word is preferable to arbitrary character.
Further, there is nothing particular to `chunk_by_character()` in
providing such a fall-back text-splitting strategy. It would be
preferable for all strategies to split oversized chunks on even-word
boundaries for example.
Note that while a "blank-line" ("\n\n") may be common in plain text, it
is unlikely to appear in the text of an element because it would have
been interpreted as an element boundary during partitioning.
Add _TextSplitter with basic separator preferences and fall-back and
apply it to chunk-splitting for all strategies. The `by_character`
chunking strategy may enhance this behavior by adding the option for a
user to specify a particular separator suited to their use case.
2023-12-20 21:45:36 -08:00
|
|
|
|
assert len(partitioned_table_elements_5_chars[1].text) == 5
|
2023-10-03 09:40:34 -07:00
|
|
|
|
assert len(partitioned_table_elements_5_chars[0].metadata.text_as_html) == 5
|
|
|
|
|
|
|
|
|
|
# the first table element is under 200 chars so doesn't get chunked!
|
|
|
|
|
assert table_elements[0] == partitioned_table_elements_200_chars[0]
|
|
|
|
|
assert len(partitioned_table_elements_200_chars[0].text) < 200
|
2024-01-10 14:19:24 -08:00
|
|
|
|
assert len(partitioned_table_elements_200_chars[1].text) == 198
|
2023-10-03 09:40:34 -07:00
|
|
|
|
assert len(partitioned_table_elements_200_chars[1].metadata.text_as_html) == 200
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_add_chunking_strategy_chars_on_partition_auto_adds_is_continuation():
|
|
|
|
|
filename = "example-docs/example-10k-1p.html"
|
|
|
|
|
|
2023-10-09 12:42:36 -07:00
|
|
|
|
table_elements = [e for e in partition(filename) if isinstance(e, Table)]
|
|
|
|
|
chunked_table_elements = [
|
2023-10-03 09:40:34 -07:00
|
|
|
|
e
|
|
|
|
|
for e in partition(
|
|
|
|
|
filename,
|
2023-10-09 12:42:36 -07:00
|
|
|
|
chunking_strategy="by_title",
|
2023-10-03 09:40:34 -07:00
|
|
|
|
)
|
|
|
|
|
if isinstance(e, Table)
|
|
|
|
|
]
|
|
|
|
|
|
2023-10-09 12:42:36 -07:00
|
|
|
|
assert table_elements != chunked_table_elements
|
|
|
|
|
|
2023-10-03 09:40:34 -07:00
|
|
|
|
i = 0
|
2023-10-09 12:42:36 -07:00
|
|
|
|
for table in chunked_table_elements:
|
2023-10-03 09:40:34 -07:00
|
|
|
|
# have to reset the counter to 0 here when we encounter a Table element
|
|
|
|
|
if isinstance(table, Table):
|
|
|
|
|
i = 0
|
|
|
|
|
if i > 0 and isinstance(table, TableChunk):
|
|
|
|
|
assert table.metadata.is_continuation is True
|
|
|
|
|
i += 1
|
2023-10-10 20:47:56 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
EXAMPLE_LANG_DOCS = "example-docs/language-docs/eng_spa_mult."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
|
|
|
"file_extension",
|
|
|
|
|
[
|
|
|
|
|
"doc",
|
|
|
|
|
"docx",
|
|
|
|
|
"eml",
|
|
|
|
|
"epub",
|
|
|
|
|
"html",
|
|
|
|
|
"md",
|
|
|
|
|
"odt",
|
|
|
|
|
"org",
|
|
|
|
|
"ppt",
|
|
|
|
|
"pptx",
|
|
|
|
|
"rst",
|
|
|
|
|
"rtf",
|
|
|
|
|
"txt",
|
|
|
|
|
"xml",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
def test_partition_respects_language_arg(file_extension):
|
|
|
|
|
filename = EXAMPLE_LANG_DOCS + file_extension
|
|
|
|
|
elements = partition(filename=filename, languages=["deu"])
|
|
|
|
|
assert all(element.metadata.languages == ["deu"] for element in elements)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_respects_detect_language_per_element_arg():
|
|
|
|
|
filename = "example-docs/language-docs/eng_spa_mult.txt"
|
|
|
|
|
elements = partition(filename=filename, detect_language_per_element=True)
|
|
|
|
|
langs = [element.metadata.languages for element in elements]
|
|
|
|
|
assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# check that the ["eng"] default in `partition` does not overwrite the ["auto"]
|
|
|
|
|
# default in other `partition_` functions.
|
|
|
|
|
def test_partition_default_does_not_overwrite_other_defaults():
|
|
|
|
|
# the default for `languages` is ["auto"] in partition_text
|
|
|
|
|
from unstructured.partition.text import partition_text
|
|
|
|
|
|
|
|
|
|
# Use a document that is primarily in a language other than English
|
|
|
|
|
filename = "example-docs/language-docs/UDHR_first_article_all.txt"
|
|
|
|
|
text_elements = partition_text(filename)
|
|
|
|
|
assert text_elements[0].metadata.languages != ["eng"]
|
|
|
|
|
|
|
|
|
|
auto_elements = partition(filename)
|
|
|
|
|
assert auto_elements[0].metadata.languages != ["eng"]
|
|
|
|
|
assert auto_elements[0].metadata.languages == text_elements[0].metadata.languages
|
2023-10-14 17:46:24 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_languages_default_to_None():
|
|
|
|
|
filename = "example-docs/handbook-1p.docx"
|
|
|
|
|
elements = partition(filename=filename, detect_language_per_element=True)
|
|
|
|
|
# PageBreak and other elements with no text will have `None` for `languages`
|
|
|
|
|
none_langs = [element for element in elements if element.metadata.languages is None]
|
|
|
|
|
assert none_langs[0].text == ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_languages_incorrectly_defaults_to_English(tmpdir):
|
|
|
|
|
# We don't totally rely on langdetect for short text, so text like the following that is
|
|
|
|
|
# in German will be labeled as English.
|
|
|
|
|
german = "Ein kurzer Satz."
|
|
|
|
|
filepath = os.path.join(tmpdir, "short-german.txt")
|
|
|
|
|
with open(filepath, "w") as f:
|
|
|
|
|
f.write(german)
|
|
|
|
|
elements = partition(filepath)
|
|
|
|
|
assert elements[0].metadata.languages == ["eng"]
|
2023-11-07 18:44:58 -06:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_timeout_gets_routed():
|
|
|
|
|
class CallException(Exception):
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
mock_ocr_func = Mock(side_effect=CallException("Function called!"))
|
|
|
|
|
with patch("unstructured.partition.auto.file_and_type_from_url", mock_ocr_func), pytest.raises(
|
|
|
|
|
CallException
|
|
|
|
|
):
|
|
|
|
|
auto.partition(url="fake_url", request_timeout=326)
|
|
|
|
|
kwargs = mock_ocr_func.call_args.kwargs
|
|
|
|
|
assert "request_timeout" in kwargs
|
|
|
|
|
assert kwargs["request_timeout"] == 326
|
2024-01-17 17:50:36 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_image_with_bmp_with_auto(
|
|
|
|
|
tmpdir,
|
|
|
|
|
filename="example-docs/layout-parser-paper-with-table.jpg",
|
|
|
|
|
):
|
|
|
|
|
bmp_filename = os.path.join(tmpdir.dirname, "example.bmp")
|
|
|
|
|
img = Image.open(filename)
|
|
|
|
|
img.save(bmp_filename)
|
|
|
|
|
|
|
|
|
|
elements = partition(
|
|
|
|
|
filename=bmp_filename,
|
|
|
|
|
strategy=PartitionStrategy.HI_RES,
|
|
|
|
|
)
|
|
|
|
|
table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
|
|
|
|
|
assert len(table) == 1
|
2024-06-14 11:11:38 -07:00
|
|
|
|
assert "<table><thead><tr>" in table[0]
|
|
|
|
|
assert "</thead><tbody><tr>" in table[0]
|
2024-02-07 17:31:49 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_eml_add_signature_to_metadata():
|
|
|
|
|
elements = partition(filename="example-docs/eml/signed-doc.p7s")
|
|
|
|
|
assert len(elements) == 1
|
|
|
|
|
assert elements[0].text == "This is a test"
|
|
|
|
|
assert elements[0].metadata.signature == "<SIGNATURE>\n"
|