unstructured/test_unstructured/partition/test_auto.py

from __future__ import annotations

import json
import os
import pathlib
import tempfile
import warnings
from importlib import import_module
from unittest.mock import Mock, patch

import docx
import pytest
from PIL import Image

from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
from test_unstructured.partition.test_constants import (
    EXPECTED_TABLE,
    EXPECTED_TABLE_XLSX,
    EXPECTED_TEXT,
    EXPECTED_TEXT_XLSX,
    EXPECTED_TITLE,
)
from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import (
    Address,
    ElementMetadata,
    ListItem,
    NarrativeText,
    Table,
    TableChunk,
    Text,
    Title,
)
from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, FileType
from unstructured.partition import auto
from unstructured.partition.auto import _get_partition_with_extras, partition
from unstructured.partition.common import convert_office_doc
from unstructured.partition.utils.constants import PartitionStrategy
from unstructured.staging.base import elements_to_json

DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")

EXPECTED_EMAIL_OUTPUT = [
    NarrativeText(text="This is a test email to use for unit tests."),
    Title(text="Important points:"),
    ListItem(text="Roses are red"),
    ListItem(text="Violets are blue"),
]

EML_TEST_FILE = "eml/fake-email.eml"

is_in_docker = os.path.exists("/.dockerenv")


def test_auto_partition_email_from_filename():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
    assert elements[0].metadata.filename == os.path.basename(filename)
    assert elements[0].metadata.file_directory == os.path.split(filename)[0]


def test_auto_partition_email_from_file():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
    with open(filename) as f:
        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT


def test_auto_partition_email_from_file_rb():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
    with open(filename, "rb") as f:
        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT


@pytest.fixture()
def mock_docx_document():
    document = docx.Document()

    document.add_paragraph("These are a few of my favorite things:", style="Heading 1")
    # NOTE(robinson) - this should get picked up as a list item due to the •
    document.add_paragraph("• Parrots", style="Normal")
    document.add_paragraph("Hockey", style="List Bullet")
    # NOTE(robinson) - this should get picked up as a title
    document.add_paragraph("Analysis", style="Normal")
    # NOTE(robinson) - this should get dropped because it is empty
    document.add_paragraph("", style="Normal")
    # NOTE(robinson) - this should get picked up as a narrative text
    document.add_paragraph("This is my first thought. This is my second thought.", style="Normal")
    document.add_paragraph("This is my third thought.", style="Body Text")
    # NOTE(robinson) - this should just be regular text
    document.add_paragraph("2023")

    return document


@pytest.fixture()
def expected_docx_elements():
    return [
        Title("These are a few of my favorite things:"),
        ListItem("Parrots"),
        ListItem("Hockey"),
        Title("Analysis"),
        NarrativeText("This is my first thought. This is my second thought."),
        NarrativeText("This is my third thought."),
        Text("2023"),
    ]


def test_auto_partition_docx_with_filename(mock_docx_document, expected_docx_elements, tmpdir):
    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
    mock_docx_document.save(filename)

    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
    assert elements == expected_docx_elements
    assert elements[0].metadata.filename == os.path.basename(filename)


def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_elements, tmpdir):
    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
    mock_docx_document.save(filename)

    with open(filename, "rb") as f:
        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
    assert elements == expected_docx_elements


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
)
def test_auto_partition_doc_with_filename(
    mock_docx_document,
    expected_docx_elements,
    tmp_path: pathlib.Path,
    pass_metadata_filename,
    content_type,
):
    docx_file_path = str(tmp_path / "mock_document.docx")
    doc_file_path = str(tmp_path / "mock_document.doc")
    mock_docx_document.save(docx_file_path)
    convert_office_doc(docx_file_path, str(tmp_path), "doc")
    metadata_filename = doc_file_path if pass_metadata_filename else None
    elements = partition(
        filename=doc_file_path,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )
    assert elements == expected_docx_elements
    assert elements[0].metadata.filename == "mock_document.doc"
    assert elements[0].metadata.file_directory == str(tmp_path)


# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
# determine that the file is an .doc document
@pytest.mark.xfail()
def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements, tmpdir):
    docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
    doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
    mock_docx_document.save(docx_filename)
    convert_office_doc(docx_filename, tmpdir.dirname, "doc")

    with open(doc_filename, "rb") as f:
        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
    assert elements == expected_docx_elements


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_filename(pass_metadata_filename, content_type):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
    metadata_filename = filename if pass_metadata_filename else None
    elements = partition(
        filename=filename,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )
    assert len(elements) > 0
    assert elements[0].metadata.filename == os.path.basename(filename)
    assert elements[0].metadata.file_directory == os.path.split(filename)[0]


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_file(pass_metadata_filename, content_type):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
    metadata_filename = filename if pass_metadata_filename else None
    with open(filename) as f:
        elements = partition(
            file=f,
            metadata_filename=metadata_filename,
            content_type=content_type,
            strategy=PartitionStrategy.HI_RES,
        )
    assert len(elements) > 0


def test_auto_partition_html_from_file_rb():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
    with open(filename, "rb") as f:
        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
    assert len(elements) > 0


# NOTE(robinson) - skipping this test with docker image to avoid putting the
# test fixtures into the image
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partitioned_json_output_maintains_consistency_with_fixture_elements():
    """Test auto-processing an unstructured json output file by filename."""
    original_file_name = "spring-weather.html"
    json_file_path = (
        pathlib.Path(DIRECTORY).parents[1]
        / "test_unstructured_ingest"
        / "expected-structured-output"
        / "azure"
        / f"{original_file_name}.json"
    )
    with open(json_file_path) as json_f:
        expected_result = json.load(json_f)

    partitioning_result = json.loads(
        elements_to_json(
            partition(
                filename=json_file_path,
                # -- use the original file name to get the same element IDs (hashes) --
                metadata_filename=original_file_name,
                strategy=PartitionStrategy.HI_RES,
            )
        )
    )
    for elem in partitioning_result:
        elem.pop("metadata")
    for elem in expected_result:
        elem.pop("metadata")
    assert expected_result == partitioning_result


def test_auto_partition_json_raises_with_unprocessable_json(tmpdir):
    # NOTE(robinson) - This is unprocessable because it is not a list of dicts,
    # per the Unstructured ISD format
    text = '{"hi": "there"}'

    filename = os.path.join(tmpdir, "unprocessable.json")
    with open(filename, "w") as f:
        f.write(text)

    with pytest.raises(ValueError):
        partition(filename=filename)


@pytest.mark.xfail(
    reason="parsed as text not json, https://github.com/Unstructured-IO/unstructured/issues/492",
)
def test_auto_partition_json_from_file():
    """Test auto-processing an unstructured json output file by file handle."""
    filename = os.path.join(
        EXAMPLE_DOCS_DIRECTORY,
        "..",
        "test_unstructured_ingest",
        "expected-structured-output",
        "azure-blob-storage",
        "spring-weather.html.json",
    )
    with open(filename) as json_f:
        json_data = json.load(json_f)
    with open(filename, encoding="utf-8") as partition_f:
        json_elems = json.loads(
            elements_to_json(partition(file=partition_f, strategy=PartitionStrategy.HI_RES))
        )
    for elem in json_elems:
        # coordinates are always in the element data structures, even if None
        elem.pop("coordinates")
        elem.pop("coordinate_system")
    assert json_data == json_elems


EXPECTED_TEXT_OUTPUT = [
    NarrativeText(text="This is a test document to use for unit tests."),
    Address(text="Doylestown, PA 18901"),
    Title(text="Important points:"),
    ListItem(text="Hamburgers are delicious"),
    ListItem(text="Dogs are the best"),
    ListItem(text="I love fuzzy blankets"),
]


def test_auto_partition_text_from_filename():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
    assert len(elements) > 0
    assert elements == EXPECTED_TEXT_OUTPUT
    assert elements[0].metadata.filename == os.path.basename(filename)
    assert elements[0].metadata.file_directory == os.path.split(filename)[0]


def test_auto_partition_text_from_file():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename) as f:
        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
    assert len(elements) > 0
    assert elements == EXPECTED_TEXT_OUTPUT


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_filename(pass_metadata_filename, content_type, request):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    metadata_filename = filename if pass_metadata_filename else None

    elements = partition(
        filename=filename,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    # NOTE(alan): Xfail since new model skips the word Zejiang
    request.applymarker(pytest.mark.xfail)

    idx = 3
    assert isinstance(elements[idx], Title)
    assert elements[idx].text.startswith("LayoutParser")

    assert elements[idx].metadata.filename == os.path.basename(filename)
    assert elements[idx].metadata.file_directory == os.path.split(filename)[0]

    idx += 1
    assert isinstance(elements[idx], NarrativeText)
    assert elements[idx].text.startswith("Zejiang Shen")


def test_auto_partition_pdf_uses_table_extraction():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    with patch(
        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
    ) as mock_process_file_with_model:
        partition(filename, pdf_infer_table_structure=True, strategy=PartitionStrategy.HI_RES)
        assert mock_process_file_with_model.call_args[1]["infer_table_structure"]


def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")

    mock_return = [NarrativeText("Hello there!")]
    with patch.object(auto, "partition_pdf", return_value=mock_return) as mock_partition:
        mock_partition_with_extras_map = {"pdf": mock_partition}
        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
        partition(filename=filename, strategy=PartitionStrategy.FAST)

    mock_partition.assert_called_once_with(
        filename=filename,
        file=None,
        url=None,
        strategy=PartitionStrategy.FAST,
        languages=None,
        metadata_filename=None,
        include_page_breaks=False,
        infer_table_structure=False,
        extract_images_in_pdf=False,
        extract_image_block_types=None,
        extract_image_block_output_dir=None,
        extract_image_block_to_payload=False,
        hi_res_model_name=None,
        date_from_file_object=False,
        starting_page_number=1,
    )


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, request):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    metadata_filename = filename if pass_metadata_filename else None

    with open(filename, "rb") as f:
        elements = partition(
            file=f,
            metadata_filename=metadata_filename,
            content_type=content_type,
            strategy=PartitionStrategy.HI_RES,
        )

    # NOTE(alan): Xfail since new model skips the word Zejiang
    request.applymarker(pytest.mark.xfail)

    idx = 3
    assert isinstance(elements[idx], Title)
    assert elements[idx].text.startswith("LayoutParser")

    idx += 1
    assert isinstance(elements[idx], NarrativeText)
    assert elements[idx].text.startswith("Zejiang Shen")


def test_auto_partition_formats_languages_for_tesseract():
    filename = "example-docs/chi_sim_image.jpeg"
    with patch(
        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
    ) as mock_process_file_with_ocr:
        partition(filename, strategy=PartitionStrategy.HI_RES, languages=["zh"])
        _, kwargs = mock_process_file_with_ocr.call_args_list[0]
        assert "ocr_languages" in kwargs
        assert kwargs["ocr_languages"] == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"


def test_auto_partition_element_metadata_user_provided_languages():
    filename = "example-docs/chevron-page.pdf"
    elements = partition(filename=filename, strategy=PartitionStrategy.OCR_ONLY, languages=["eng"])
    assert elements[0].metadata.languages == ["eng"]


@pytest.mark.parametrize(
    ("languages", "ocr_languages"),
    [(["auto"], ""), (["eng"], "")],
)
def test_auto_partition_ignores_empty_string_for_ocr_languages(languages, ocr_languages):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "book-war-and-peace-1p.txt")
    elements = partition(
        filename=filename,
        strategy=PartitionStrategy.OCR_ONLY,
        ocr_languages=ocr_languages,
        languages=languages,
    )
    assert elements[0].metadata.languages == ["eng"]


def test_auto_partition_warns_with_ocr_languages(caplog):
    filename = "example-docs/chevron-page.pdf"
    partition(filename=filename, strategy=PartitionStrategy.HI_RES, ocr_languages="eng")
    assert "The ocr_languages kwarg will be deprecated" in caplog.text


def test_partition_pdf_doesnt_raise_warning():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    # NOTE(robinson): This is the recommended way to check that no warning is emitted,
    # per the pytest docs.
    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
    #      #additional-use-cases-of-warnings-in-tests
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        partition(filename=filename, strategy=PartitionStrategy.HI_RES)


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_image(pass_metadata_filename, content_type):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
    metadata_filename = filename if pass_metadata_filename else None
    elements = partition(
        filename=filename,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy=PartitionStrategy.AUTO,
    )

    # should be same result as test_partition_image_default_strategy_hi_res() in test_image.py
    title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    idx = 2
    assert elements[idx].text == title
    assert elements[idx].metadata.coordinates is not None


@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_image_element_extraction(
    extract_image_block_to_payload,
    filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "embedded-images-tables.jpg"),
):
    extract_image_block_types = ["Image", "Table"]

    with tempfile.TemporaryDirectory() as tmpdir:
        elements = partition(
            filename=filename,
            extract_image_block_types=extract_image_block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=tmpdir,
        )

        assert_element_extraction(
            elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
        )


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpg(pass_metadata_filename, content_type):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
    metadata_filename = filename if pass_metadata_filename else None
    elements = partition(
        filename=filename,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy=PartitionStrategy.AUTO,
    )
    assert len(elements) > 0


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpg_from_file(pass_metadata_filename, content_type):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
    metadata_filename = filename if pass_metadata_filename else None
    with open(filename, "rb") as f:
        elements = partition(
            file=f,
            metadata_filename=metadata_filename,
            content_type=content_type,
            strategy=PartitionStrategy.AUTO,
        )
    assert len(elements) > 0


def test_auto_partition_raises_with_bad_type(monkeypatch):
    monkeypatch.setattr(auto, "detect_filetype", lambda *args, **kwargs: None)
    with pytest.raises(ValueError):
        partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)


EXPECTED_PPTX_OUTPUT = [
    Title(text="Adding a Bullet Slide"),
    ListItem(text="Find the bullet slide layout"),
    ListItem(text="Use _TextFrame.text for first bullet"),
    ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
    NarrativeText(text="Here is a lot of text!"),
    NarrativeText(text="Here is some text in a text box!"),
]


def test_auto_partition_pptx_from_filename():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
    assert elements == EXPECTED_PPTX_OUTPUT
    assert elements[0].metadata.filename == os.path.basename(filename)
    assert elements[0].metadata.file_directory == os.path.split(filename)[0]


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_ppt_from_filename():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
    assert elements == EXPECTED_PPTX_OUTPUT
    assert elements[0].metadata.filename == os.path.basename(filename)
    assert elements[0].metadata.file_directory == os.path.split(filename)[0]


def test_auto_with_page_breaks():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    elements = partition(
        filename=filename, include_page_breaks=True, strategy=PartitionStrategy.HI_RES
    )
    assert "PageBreak" in [elem.category for elem in elements]


def test_auto_partition_epub_from_filename():
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
    assert len(elements) > 0
    assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")


def test_auto_partition_epub_from_file():
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
    with open(filename, "rb") as f:
        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
    assert len(elements) > 0
    assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")


EXPECTED_MSG_OUTPUT = [
    NarrativeText(text="This is a test email to use for unit tests."),
    Title(text="Important points:"),
    ListItem(text="Roses are red"),
    ListItem(text="Violets are blue"),
]


def test_auto_partition_msg_from_filename():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
    assert elements == EXPECTED_MSG_OUTPUT


def test_auto_partition_rtf_from_filename():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf")
    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
    assert elements[0] == Title("My First Heading")


def test_auto_partition_from_url():
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
    elements = partition(url=url, content_type="text/plain", strategy=PartitionStrategy.HI_RES)
    assert elements[0] == Title("Apache License")
    assert elements[0].metadata.url == url


def test_auto_partition_from_url_with_rfc9110_content_type():
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
    elements = partition(
        url=url, content_type="text/plain; charset=utf-8", strategy=PartitionStrategy.HI_RES
    )
    assert elements[0] == Title("Apache License")
    assert elements[0].metadata.url == url


def test_auto_partition_from_url_without_providing_content_type():
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
    elements = partition(url=url, strategy=PartitionStrategy.HI_RES)
    assert elements[0] == Title("Apache License")
    assert elements[0].metadata.url == url


def test_partition_md_works_with_embedded_html():
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"
    elements = partition(url=url, content_type="text/markdown", strategy=PartitionStrategy.HI_RES)
    elements[0].text
    unstructured_found = False
    for element in elements:
        if "unstructured" in elements[0].text:
            unstructured_found = True
            break
    assert unstructured_found is True


def test_auto_partition_warns_if_header_set_and_not_url(caplog):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
    partition(
        filename=filename, headers={"Accept": "application/pdf"}, strategy=PartitionStrategy.HI_RES
    )
    assert caplog.records[0].levelname == "WARNING"


def test_auto_partition_works_with_unstructured_jsons():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
    assert elements[0].text == "News Around NOAA"


def test_auto_partition_works_with_unstructured_jsons_from_file():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
    with open(filename, "rb") as f:
        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
    assert elements[0].text == "News Around NOAA"


def test_auto_partition_odt_from_filename():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
    assert elements[0] == Title("Lorem ipsum dolor sit amet.")


def test_auto_partition_odt_from_file():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    with open(filename, "rb") as f:
        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)

    assert elements[0] == Title("Lorem ipsum dolor sit amet.")


@pytest.mark.parametrize(
    ("content_type", "routing_func", "expected"),
    [
        ("text/csv", "csv", "text/csv"),
        ("text/html", "html", "text/html"),
        ("jdsfjdfsjkds", "pdf", None),
    ],
)
def test_auto_adds_filetype_to_metadata(content_type, routing_func, expected, monkeypatch):
    with patch(
        f"unstructured.partition.auto.partition_{routing_func}",
        lambda *args, **kwargs: [Text("text 1"), Text("text 2")],
    ) as mock_partition:
        mock_partition_with_extras_map = {routing_func: mock_partition}
        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
        elements = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type)
    assert len(elements) == 2
    assert all(el.metadata.filetype == expected for el in elements)


@pytest.mark.parametrize(
    ("content_type", "expected"),
    [
        ("application/pdf", FILETYPE_TO_MIMETYPE[FileType.PDF]),
        (None, FILETYPE_TO_MIMETYPE[FileType.PDF]),
    ],
)
def test_auto_filetype_overrides_file_specific(content_type, expected, monkeypatch):
    pdf_metadata = ElementMetadata(filetype="imapdf")
    with patch(
        "unstructured.partition.auto.partition_pdf",
        lambda *args, **kwargs: [
            Text("text 1", metadata=pdf_metadata),
            Text("text 2", metadata=pdf_metadata),
        ],
    ) as mock_partition:
        mock_partition_with_extras_map = {"pdf": mock_partition}
        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
        elements = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type)
    assert len(elements) == 2
    assert all(el.metadata.filetype == expected for el in elements)


@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_pdf_element_extraction(
    extract_image_block_to_payload,
    filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "embedded-images-tables.pdf"),
):
    extract_image_block_types = ["Image", "Table"]

    with tempfile.TemporaryDirectory() as tmpdir:
        elements = partition(
            filename=filename,
            extract_image_block_types=extract_image_block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=tmpdir,
        )

        assert_element_extraction(
            elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
        )


supported_filetypes = [
    _
    for _ in FileType
    if _
    not in (
        FileType.UNK,
        FileType.ZIP,
        FileType.XLS,
    )
]


FILETYPE_TO_MODULE = {
    FileType.JPG: "image",
    FileType.PNG: "image",
    FileType.HEIC: "image",
    FileType.TXT: "text",
    FileType.EML: "email",
}


@pytest.mark.parametrize("filetype", supported_filetypes)
def test_file_specific_produces_correct_filetype(filetype: FileType):
    if filetype in auto.IMAGE_FILETYPES or filetype in (FileType.WAV, FileType.EMPTY):
        pytest.skip()
    extension = filetype.name.lower()
    filetype_module = FILETYPE_TO_MODULE.get(filetype, extension)
    fun_name = "partition_" + filetype_module
    module = import_module(f"unstructured.partition.{filetype_module}")  # noqa
    fun = eval(f"module.{fun_name}")
    for file in pathlib.Path("example-docs").iterdir():
        if file.is_file() and file.suffix == f".{extension}":
            elements = fun(str(file))
            assert all(
                el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype]
                for el in elements
                if el.metadata.filetype is not None
            )
            break


def test_auto_partition_xml_from_filename(filename="example-docs/factbook.xml"):
    elements = partition(filename=filename, xml_keep_tags=False, metadata_filename=filename)

    assert elements[0].text == "United States"
    assert elements[0].metadata.filename == "factbook.xml"


def test_auto_partition_xml_from_file(filename="example-docs/factbook.xml"):
    with open(filename, "rb") as f:
        elements = partition(file=f, xml_keep_tags=False)

    assert elements[0].text == "United States"


def test_auto_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"):
    elements = partition(filename=filename, xml_keep_tags=True)

    assert "<leader>Joe Biden</leader>" in elements[0].text
    assert elements[0].metadata.filename == "factbook.xml"


def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"):
    with open(filename, "rb") as f:
        elements = partition(file=f, xml_keep_tags=True)

    assert "<leader>Joe Biden</leader>" in elements[0].text


EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"


def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
    elements = partition(filename=filename, include_header=False, skip_infer_table_types=[])

    assert sum(isinstance(element, Table) for element in elements) == 2
    assert sum(isinstance(element, Title) for element in elements) == 2
    assert len(elements) == 4

    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
    assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT_XLSX
    assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX
    assert elements[1].metadata.page_number == 1
    assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE


@pytest.mark.parametrize(
    ("skip_infer_table_types", "filename", "has_text_as_html_field"),
    [
        (["xlsx"], "stanley-cups.xlsx", False),
        ([], "stanley-cups.xlsx", True),
        (["odt"], "fake.odt", False),
        ([], "fake.odt", True),
    ],
)
def test_auto_partition_respects_skip_infer_table_types(
    skip_infer_table_types,
    filename,
    has_text_as_html_field,
):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
    with open(filename, "rb") as f:
        table_elements = [
            e
            for e in partition(file=f, skip_infer_table_types=skip_infer_table_types)
            if isinstance(e, Table)
        ]
        for table_element in table_elements:
            table_element_has_text_as_html_field = (
                hasattr(table_element.metadata, "text_as_html")
                and table_element.metadata.text_as_html is not None
            )
        assert table_element_has_text_as_html_field == has_text_as_html_field


def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
    with open(filename, "rb") as f:
        elements = partition(file=f, include_header=False, skip_infer_table_types=[])

    assert sum(isinstance(element, Table) for element in elements) == 2
    assert sum(isinstance(element, Title) for element in elements) == 2
    assert len(elements) == 4

    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
    assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT_XLSX
    assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX
    assert elements[1].metadata.page_number == 1
    assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE


def test_auto_partition_respects_starting_page_number_argument_for_xlsx():
    elements = partition("example-docs/stanley-cups.xlsx", starting_page_number=3)
    assert elements[1].metadata.page_number == 3


EXPECTED_XLS_TEXT_LEN = 550


EXPECTED_XLS_INITIAL_45_CLEAN_TEXT = "MC What is 2+2? 4 correct 3 incorrect MA What"

EXPECTED_XLS_TABLE = (
    """<table border="1" class="dataframe">
  <tbody>
    <tr>
      <td>MC</td>
      <td>What is 2+2?</td>
      <td>4</td>
      <td>correct</td>
      <td>3</td>
      <td>incorrect</td>
      <td></td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>MA</td>
      <td>What C datatypes are 8 bits? (assume i386)</td>
      <td>int</td>
      <td></td>
      <td>float</td>
      <td></td>
      <td>double</td>
      <td></td>
      <td>char</td>
    </tr>
    <tr>
      <td>TF</td>
      <td>Bagpipes are awesome.</td>
      <td>true</td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>ESS</td>
      <td>How have the original Henry Hornbostel buildings """
    """influenced campus architecture and design in the last 30 years?</td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>ORD</td>
      <td>Rank the following in their order of operation.</td>
      <td>Parentheses</td>
      <td>Exponents</td>
      <td>Division</td>
      <td>Addition</td>
      <td></td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>FIB</td>
      <td>The student activities fee is</td>
      <td>95</td>
      <td>dollars for students enrolled in</td>
      <td>19</td>
      <td>units or more,</td>
      <td></td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>MAT</td>
      <td>Match the lower-case greek letter with its capital form.</td>
      <td>λ</td>
      <td>Λ</td>
      <td>α</td>
      <td>γ</td>
      <td>Γ</td>
      <td>φ</td>
      <td>Φ</td>
    </tr>
  </tbody>
</table>"""
)


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"):
    elements = partition(filename=filename, include_header=False, skip_infer_table_types=[])

    assert sum(isinstance(element, Table) for element in elements) == 2
    assert len(elements) == 14

    assert clean_extra_whitespace(elements[0].text)[:45] == EXPECTED_XLS_INITIAL_45_CLEAN_TEXT
    # NOTE(crag): if the beautifulsoup4 package is installed, some (but not all) additional
    # whitespace is removed, so the expected text length is less than is the case
    # when beautifulsoup4 is *not* installed. E.g.
    # "\n\n\nMA\nWhat C datatypes are 8 bits" vs.
    # '\n  \n    \n      MA\n      What C datatypes are 8 bits?... "
    assert len(elements[0].text) == EXPECTED_XLS_TEXT_LEN
    assert elements[0].metadata.text_as_html == EXPECTED_XLS_TABLE


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
    elements = partition(filename=filename)

    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
    assert elements[0].metadata.filetype == "text/csv"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"):
    elements = partition(filename=filename)

    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
    assert elements[0].metadata.filetype == "text/tsv"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
    with open(filename, "rb") as f:
        elements = partition(file=f)

    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
    assert isinstance(elements[0], Table)
    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
    assert elements[0].metadata.filetype == "text/csv"


def test_auto_partition_html_pre_from_file(filename="example-docs/fake-html-pre.htm"):
    elements = partition(filename=filename)

    assert len(elements) > 0
    assert "PageBreak" not in [elem.category for elem in elements]
    assert clean_extra_whitespace(elements[0].text).startswith("[107th Congress Public Law 56]")
    assert isinstance(elements[0], NarrativeText)
    assert elements[0].metadata.filetype == "text/html"
    assert elements[0].metadata.filename == "fake-html-pre.htm"


def test_auto_partition_works_on_empty_filename(filename="example-docs/empty.txt"):
    assert partition(filename=filename) == []


def test_auto_partition_works_on_empty_file(filename="example-docs/empty.txt"):
    with open(filename, "rb") as f:
        assert partition(file=f) == []


def test_auto_partition_org_from_filename(filename="example-docs/README.org"):
    elements = partition(filename=filename)

    assert elements[0] == Title("Example Docs")
    assert elements[0].metadata.filetype == "text/org"


def test_auto_partition_org_from_file(filename="example-docs/README.org"):
    with open(filename, "rb") as f:
        elements = partition(file=f, content_type="text/org")

    assert elements[0] == Title("Example Docs")
    assert elements[0].metadata.filetype == "text/org"


def test_auto_partition_rst_from_filename(filename="example-docs/README.rst"):
    elements = partition(filename=filename)

    assert elements[0] == Title("Example Docs")
    assert elements[0].metadata.filetype == "text/x-rst"


def test_auto_partition_rst_from_file(filename="example-docs/README.rst"):
    with open(filename, "rb") as f:
        elements = partition(file=f, content_type="text/x-rst")

    assert elements[0] == Title("Example Docs")
    assert elements[0].metadata.filetype == "text/x-rst"


def test_auto_partition_metadata_filename():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename) as f:
        elements = partition(file=f, metadata_filename=filename)
    assert elements[0].metadata.filename == os.path.split(filename)[-1]


def test_auto_partition_warns_about_file_filename_deprecation(caplog):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename) as f:
        elements = partition(file=f, file_filename=filename)
    assert elements[0].metadata.filename == os.path.split(filename)[-1]
    assert "WARNING" in caplog.text
    assert "The file_filename kwarg will be deprecated" in caplog.text


def test_auto_partition_raises_with_file_and_metadata_filename():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename) as f, pytest.raises(ValueError):
        partition(file=f, file_filename=filename, metadata_filename=filename)


def test_get_partition_with_extras_prompts_for_install_if_missing():
    partition_with_extras_map = {}
    with pytest.raises(ImportError) as exception_info:
        _get_partition_with_extras("pdf", partition_with_extras_map)

    msg = str(exception_info.value)
    assert 'Install the pdf dependencies with pip install "unstructured[pdf]"' in msg


def test_add_chunking_strategy_on_partition_auto():
    filename = "example-docs/example-10k-1p.html"
    elements = partition(filename)
    chunk_elements = partition(filename, chunking_strategy="by_title")
    chunks = chunk_by_title(elements)
    assert chunk_elements != elements
    assert chunk_elements == chunks


def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
    filename = "example-docs/example-10k-1p.html"

    # default chunk size in chars is 200
    partitioned_table_elements_200_chars = [
        e
        for e in partition(
            filename,
            chunking_strategy="by_title",
            max_characters=200,
            combine_text_under_n_chars=5,
        )
        if isinstance(e, (Table, TableChunk))
    ]

    partitioned_table_elements_5_chars = [
        e
        for e in partition(
            filename,
            chunking_strategy="by_title",
            max_characters=5,
            combine_text_under_n_chars=5,
        )
        if isinstance(e, (Table, TableChunk))
    ]

    elements = partition(filename)

    table_elements = [e for e in elements if isinstance(e, Table)]

    assert len(partitioned_table_elements_5_chars) != len(table_elements)
    assert len(partitioned_table_elements_200_chars) != len(table_elements)

    # trailing whitespace is stripped from the first chunk, leaving only a checkbox character
    assert len(partitioned_table_elements_5_chars[0].text) == 1
    # but the second chunk is the full 5 characters
    assert len(partitioned_table_elements_5_chars[1].text) == 5
    assert len(partitioned_table_elements_5_chars[0].metadata.text_as_html) == 5

    # the first table element is under 200 chars so doesn't get chunked!
    assert table_elements[0] == partitioned_table_elements_200_chars[0]
    assert len(partitioned_table_elements_200_chars[0].text) < 200
    assert len(partitioned_table_elements_200_chars[1].text) == 198
    assert len(partitioned_table_elements_200_chars[1].metadata.text_as_html) == 200


def test_add_chunking_strategy_chars_on_partition_auto_adds_is_continuation():
    filename = "example-docs/example-10k-1p.html"

    table_elements = [e for e in partition(filename) if isinstance(e, Table)]
    chunked_table_elements = [
        e
        for e in partition(
            filename,
            chunking_strategy="by_title",
        )
        if isinstance(e, Table)
    ]

    assert table_elements != chunked_table_elements

    i = 0
    for table in chunked_table_elements:
        # have to reset the counter to 0 here when we encounter a Table element
        if isinstance(table, Table):
            i = 0
        if i > 0 and isinstance(table, TableChunk):
            assert table.metadata.is_continuation is True
            i += 1


EXAMPLE_LANG_DOCS = "example-docs/language-docs/eng_spa_mult."


@pytest.mark.parametrize(
    "file_extension",
    [
        "doc",
        "docx",
        "eml",
        "epub",
        "html",
        "md",
        "odt",
        "org",
        "ppt",
        "pptx",
        "rst",
        "rtf",
        "txt",
        "xml",
    ],
)
def test_partition_respects_language_arg(file_extension):
    filename = EXAMPLE_LANG_DOCS + file_extension
    elements = partition(filename=filename, languages=["deu"])
    assert all(element.metadata.languages == ["deu"] for element in elements)


def test_partition_respects_detect_language_per_element_arg():
    filename = "example-docs/language-docs/eng_spa_mult.txt"
    elements = partition(filename=filename, detect_language_per_element=True)
    langs = [element.metadata.languages for element in elements]
    assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]


# check that the ["eng"] default in `partition` does not overwrite the ["auto"]
# default in other `partition_` functions.
def test_partition_default_does_not_overwrite_other_defaults():
    # the default for `languages` is ["auto"] in partiton_text
    from unstructured.partition.text import partition_text

    # Use a document that is primarily in a language other than English
    filename = "example-docs/language-docs/UDHR_first_article_all.txt"
    text_elements = partition_text(filename)
    assert text_elements[0].metadata.languages != ["eng"]

    auto_elements = partition(filename)
    assert auto_elements[0].metadata.languages != ["eng"]
    assert auto_elements[0].metadata.languages == text_elements[0].metadata.languages


def test_partition_languages_default_to_None():
    filename = "example-docs/handbook-1p.docx"
    elements = partition(filename=filename, detect_language_per_element=True)
    # PageBreak and other elements with no text will have `None` for `languages`
    none_langs = [element for element in elements if element.metadata.languages is None]
    assert none_langs[0].text == ""


def test_partition_languages_incorrectly_defaults_to_English(tmpdir):
    # We don't totally rely on langdetect for short text, so text like the following that is
    # in German will be labeled as English.
    german = "Ein kurzer Satz."
    filepath = os.path.join(tmpdir, "short-german.txt")
    with open(filepath, "w") as f:
        f.write(german)
    elements = partition(filepath)
    assert elements[0].metadata.languages == ["eng"]


def test_partition_timeout_gets_routed():
    class CallException(Exception):
        pass

    mock_ocr_func = Mock(side_effect=CallException("Function called!"))
    with patch("unstructured.partition.auto.file_and_type_from_url", mock_ocr_func), pytest.raises(
        CallException
    ):
        auto.partition(url="fake_url", request_timeout=326)
    kwargs = mock_ocr_func.call_args.kwargs
    assert "request_timeout" in kwargs
    assert kwargs["request_timeout"] == 326


def test_partition_image_with_bmp_with_auto(
    tmpdir,
    filename="example-docs/layout-parser-paper-with-table.jpg",
):
    bmp_filename = os.path.join(tmpdir.dirname, "example.bmp")
    img = Image.open(filename)
    img.save(bmp_filename)

    elements = partition(
        filename=bmp_filename,
        strategy=PartitionStrategy.HI_RES,
    )
    table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
    assert len(table) == 1
    assert "<table><thead><tr>" in table[0]
    assert "</thead><tbody><tr>" in table[0]


def test_auto_partition_eml_add_signature_to_metadata():
    elements = partition(filename="example-docs/eml/signed-doc.p7s")
    assert len(elements) == 1
    assert elements[0].text == "This is a test"
    assert elements[0].metadata.signature == "<SIGNATURE>\n"
-												rfctr: flatten test_unstructured/partition (#3073)

**Summary**
Some partitioner test modules are placed in directories by themselves or
with one other test module. This unnecessarily obscures where to find
the test module corresponding to a partitiner.

Move partitioner test modules to mirror the directory structure of
`unstructured/partition`.
											
										
										
											2024-05-22 17:51:08 -07:00
+								from __future__ import annotations
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								import json
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								import os
 								import pathlib
-												Refactor: rename image extraction kwargs (#2303)

Currently, we're using different kwarg names in partition() and
partition_pdf(), which has implications for the API since it goes
through partition().

### Summary
- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` to `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in
`partition.auto`
- add unit tests to test element extraction for `pdf/image` via
`partition.auto`
### Testing
CI should pass.
											
										
										
											2024-01-04 09:52:00 -08:00
+								import tempfile
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
+								import warnings
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								from importlib import import_module
-												Feat: return base64 encoded images for PDF's (#2310)

Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition

elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
											
										
										
											2023-12-26 21:39:01 -08:00
+								from unittest.mock import Mock, patch
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
 								import docx
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								import pytest
-												enhancement: add support from bitmap images (#2414)

### Summary

Adds support for bitmap images (`.bmp`) in both file detection and
partitioning. Bitmap images will be processed with `partition_image`
just like JPGs and PNGs.

### Testing

```python
from unstructured.file_utils.filetype import detect_filetype
from unstructured.partition.auto import partition
from PIL import Image

filename = "example-docs/layout-parser-paper-with-table.jpg"
bmp_filename = "~/tmp/ayout-parser-paper-with-table.bmp"

img = Image.open(filename)
img.save(bmp_filename)

detect_filetype(filename=bmp_filename) # Should be FileType.BMP

elements = partition(filename=bmp_filename)
```
											
										
										
											2024-01-17 17:50:36 -05:00
+								from PIL import Image
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												Refactor: rename image extraction kwargs (#2303)

Currently, we're using different kwarg names in partition() and
partition_pdf(), which has implications for the API since it goes
through partition().

### Summary
- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` to `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in
`partition.auto`
- add unit tests to test element extraction for `pdf/image` via
`partition.auto`
### Testing
CI should pass.
											
										
										
											2024-01-04 09:52:00 -08:00
+								from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
-												fix: stop csv and tsv dropping the first line of the file (#1530)

The current code assumes the first line of csv and tsv files are a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.

Here is a snippet of code that demonstrates the current behavior and the
proposed fix

```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring

c1 = """
    Stanley Cups,,
    Team,Location,Stanley Cups
    Blues,STL,1
    Flyers,PHI,2
    Maple Leafs,TOR,13
    """

f = "./test.csv"
with open(f, 'w') as ff:
    ff.write(c1)
  
print("Suggested Improvement Keep First Line") 
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)

print("\n\nOriginal Looses First Line") 
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-17 00:59:35 +02:00
+								from test_unstructured.partition.test_constants import (
 								    EXPECTED_TABLE,
 								    EXPECTED_TABLE_XLSX,
 								    EXPECTED_TEXT,
 								    EXPECTED_TEXT_XLSX,
 								    EXPECTED_TITLE,
 								)
-												chunk_by_title decorator (#1304)

### Summary

Partial solution to #1185.
Related to #1222.
Creates decorator from `chunk_by_title` cleaning brick.
Breaks a document into sections based on the presence of Title elements.
Also starts a new section under the following conditions:

- If metadata changes, indicating a change in section or page or a
switch to processing attachments. If `multipage_sections=True`, sections
can span pages. `multipage_sections` defaults to True.
- If the length of the section exceeds `new_after_n_chars` characters.
The default is 1500. The **chunking function does not split individual
elements**, so it's possible for a section to exceed that threshold if
an individual element if over `new_after_n_chars characters`, which
could occur with a long NarrativeText element.

Combines sections under these conditions
- Sections under `combine_under_n_chars` characters are combined. The
default is 500.

### Testing

from unstructured.partition.html import partition_html

url = "https://understandingwar.org/backgrounder/russian-offensive-campaign-assessment-august-27-2023-0"
chunks = partition_html(url=url, chunking_strategy="by_title")

for chunk in chunks:
    print(chunk)
    print("\n\n" + "-"*80)
    input()

											
										
										
											2023-09-11 16:00:14 -05:00
+								from unstructured.chunking.title import chunk_by_title
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
+								from unstructured.cleaners.core import clean_extra_whitespace
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								from unstructured.documents.elements import (
 								    Address,
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								    ElementMetadata,
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    ListItem,
 								    NarrativeText,
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
+								    Table,
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
ln [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								    TableChunk,
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    Text,
 								    Title,
 								)
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, FileType
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								from unstructured.partition import auto
-												enhancement: tell users to install missing extras (#1167)

### Summary

Updates `partition` to let users know to installs the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating `partition_pdf` (or whichever function that requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
											
										
										
											2023-08-21 23:00:21 -04:00
+								from unstructured.partition.auto import _get_partition_with_extras, partition
-												feat: add `partition_doc` for `.doc` files (#236)

* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
											
										
										
											2023-02-17 09:30:23 -05:00
+								from unstructured.partition.common import convert_office_doc
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								from unstructured.partition.utils.constants import PartitionStrategy
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								from unstructured.staging.base import elements_to_json
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* parition pptx and tests

* add parition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								DIRECTORY = pathlib.Path(__file__).parent.resolve()
 								EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
 								EXPECTED_EMAIL_OUTPUT = [
 								    NarrativeText(text="This is a test email to use for unit tests."),
 								    Title(text="Important points:"),
 								    ListItem(text="Roses are red"),
 								    ListItem(text="Violets are blue"),
 								]
-												Bug/635 unicode decode error eml (#739)

* Adds functionality to extract charset info from eml files
* Adds missed file-like object handling in detect_file_encoding
* Adds functionality to replace the MIME encodings for eml files with one of the
   common encodings if a unicode error occurs
* Organize the eml example files in the example-docs/eml directory

											
										
										
											2023-06-16 17:52:13 -07:00
+								EML_TEST_FILE = "eml/fake-email.eml"
-												fix: correct order of kwargs in pandoc (#421)

* fix: correct order of kwargs in pandoc

* only skip epub tests in Docker

* changelog

---------

Co-authored-by: Crag Wolfe <crag@unstructuredai.io>
Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-30 16:54:29 -04:00
+								is_in_docker = os.path.exists("/.dockerenv")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
 								def test_auto_partition_email_from_filename():
-												Bug/635 unicode decode error eml (#739)

* Adds functionality to extract charset info from eml files
* Adds missed file-like object handling in detect_file_encoding
* Adds functionality to replace the MIME encodings for eml files with one of the
   common encodings if a unicode error occurs
* Organize the eml example files in the example-docs/eml directory

											
										
										
											2023-06-16 17:52:13 -07:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_EMAIL_OUTPUT
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset outpout file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												enhancement: add `file_directory` to element metadata (#585)

* enhancement: add `file_directory` to element metadata

* update msg test

* exclude file_directory

* update slack output

* added file directory tests on partition_x paths
											
										
										
											2023-05-15 18:25:39 -04:00
+								    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
 								def test_auto_partition_email_from_file():
-												Bug/635 unicode decode error eml (#739)

* Adds functionality to extract charset info from eml files
* Adds missed file-like object handling in detect_file_encoding
* Adds functionality to replace the MIME encodings for eml files with one of the
   common encodings if a unicode error occurs
* Organize the eml example files in the example-docs/eml directory

											
										
										
											2023-06-16 17:52:13 -07:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    with open(filename) as f:
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_EMAIL_OUTPUT
 								def test_auto_partition_email_from_file_rb():
-												Bug/635 unicode decode error eml (#739)

* Adds functionality to extract charset info from eml files
* Adds missed file-like object handling in detect_file_encoding
* Adds functionality to replace the MIME encodings for eml files with one of the
   common encodings if a unicode error occurs
* Organize the eml example files in the example-docs/eml directory

											
										
										
											2023-06-16 17:52:13 -07:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    with open(filename, "rb") as f:
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_EMAIL_OUTPUT
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								@pytest.fixture()
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								def mock_docx_document():
 								    document = docx.Document()
 								    document.add_paragraph("These are a few of my favorite things:", style="Heading 1")
 								    # NOTE(robinson) - this should get picked up as a list item due to the •
 								    document.add_paragraph("• Parrots", style="Normal")
 								    document.add_paragraph("Hockey", style="List Bullet")
 								    # NOTE(robinson) - this should get picked up as a title
 								    document.add_paragraph("Analysis", style="Normal")
 								    # NOTE(robinson) - this should get dropped because it is empty
 								    document.add_paragraph("", style="Normal")
 								    # NOTE(robinson) - this should get picked up as a narrative text
 								    document.add_paragraph("This is my first thought. This is my second thought.", style="Normal")
 								    document.add_paragraph("This is my third thought.", style="Body Text")
 								    # NOTE(robinson) - this should just be regular text
 								    document.add_paragraph("2023")
 								    return document
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								@pytest.fixture()
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								def expected_docx_elements():
 								    return [
 								        Title("These are a few of my favorite things:"),
 								        ListItem("Parrots"),
 								        ListItem("Hockey"),
 								        Title("Analysis"),
 								        NarrativeText("This is my first thought. This is my second thought."),
 								        NarrativeText("This is my third thought."),
 								        Text("2023"),
 								    ]
 								def test_auto_partition_docx_with_filename(mock_docx_document, expected_docx_elements, tmpdir):
 								    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
 								    mock_docx_document.save(filename)
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert elements == expected_docx_elements
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset outpout file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
 								def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_elements, tmpdir):
 								    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
 								    mock_docx_document.save(filename)
 								    with open(filename, "rb") as f:
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert elements == expected_docx_elements
-												rfctr: flatten test_unstructured/partition (#3073)

**Summary**
Some partitioner test modules are placed in directories by themselves or
with one other test module. This unnecessarily obscures where to find
the test module corresponding to a partitiner.

Move partitioner test modules to mirror the directory structure of
`unstructured/partition`.
											
										
										
											2024-05-22 17:51:08 -07:00
+								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
 								)
 								def test_auto_partition_doc_with_filename(
 								    mock_docx_document,
 								    expected_docx_elements,
-												rfctr: flatten test_unstructured/partition (#3073)

**Summary**
Some partitioner test modules are placed in directories by themselves or
with one other test module. This unnecessarily obscures where to find
the test module corresponding to a partitiner.

Move partitioner test modules to mirror the directory structure of
`unstructured/partition`.
											
										
										
											2024-05-22 17:51:08 -07:00
+								    tmp_path: pathlib.Path,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    pass_metadata_filename,
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    content_type,
 								):
-												rfctr: flatten test_unstructured/partition (#3073)

**Summary**
Some partitioner test modules are placed in directories by themselves or
with one other test module. This unnecessarily obscures where to find
the test module corresponding to a partitiner.

Move partitioner test modules to mirror the directory structure of
`unstructured/partition`.
											
										
										
											2024-05-22 17:51:08 -07:00
+								    docx_file_path = str(tmp_path / "mock_document.docx")
 								    doc_file_path = str(tmp_path / "mock_document.doc")
 								    mock_docx_document.save(docx_file_path)
 								    convert_office_doc(docx_file_path, str(tmp_path), "doc")
 								    metadata_filename = doc_file_path if pass_metadata_filename else None
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    elements = partition(
-												rfctr: flatten test_unstructured/partition (#3073)

**Summary**
Some partitioner test modules are placed in directories by themselves or
with one other test module. This unnecessarily obscures where to find
the test module corresponding to a partitiner.

Move partitioner test modules to mirror the directory structure of
`unstructured/partition`.
											
										
										
											2024-05-22 17:51:08 -07:00
+								        filename=doc_file_path,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								        metadata_filename=metadata_filename,
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								        content_type=content_type,
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        strategy=PartitionStrategy.HI_RES,
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    )
-												feat: add `partition_doc` for `.doc` files (#236)

* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
											
										
										
											2023-02-17 09:30:23 -05:00
+								    assert elements == expected_docx_elements
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset outpout file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == "mock_document.doc"
-												rfctr: flatten test_unstructured/partition (#3073)

**Summary**
Some partitioner test modules are placed in directories by themselves or
with one other test module. This unnecessarily obscures where to find
the test module corresponding to a partitiner.

Move partitioner test modules to mirror the directory structure of
`unstructured/partition`.
											
										
										
											2024-05-22 17:51:08 -07:00
+								    assert elements[0].metadata.file_directory == str(tmp_path)
-												feat: add `partition_doc` for `.doc` files (#236)

* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
											
										
										
											2023-02-17 09:30:23 -05:00
 								# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
 								# determine that the file is an .doc document
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								@pytest.mark.xfail()
-												feat: add `partition_doc` for `.doc` files (#236)

* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
											
										
										
											2023-02-17 09:30:23 -05:00
+								def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements, tmpdir):
 								    docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
 								    doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
 								    mock_docx_document.save(docx_filename)
 								    convert_office_doc(docx_filename, tmpdir.dirname, "doc")
 								    with open(doc_filename, "rb") as f:
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-												feat: add `partition_doc` for `.doc` files (#236)

* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
											
										
										
											2023-02-17 09:30:23 -05:00
+								    assert elements == expected_docx_elements
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
 								)
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								def test_auto_partition_html_from_filename(pass_metadata_filename, content_type):
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* parition pptx and tests

* add parition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    metadata_filename = filename if pass_metadata_filename else None
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(
 								        filename=filename,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								        metadata_filename=metadata_filename,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        content_type=content_type,
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        strategy=PartitionStrategy.HI_RES,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    )
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset outpout file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												enhancement: add `file_directory` to element metadata (#585)

* enhancement: add `file_directory` to element metadata

* update msg test

* exclude file_directory

* update slack output

* added file directory tests on partition_x paths
											
										
										
											2023-05-15 18:25:39 -04:00
+								    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
 								)
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								def test_auto_partition_html_from_file(pass_metadata_filename, content_type):
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* parition pptx and tests

* add parition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    metadata_filename = filename if pass_metadata_filename else None
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    with open(filename) as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(
 								            file=f,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								            metadata_filename=metadata_filename,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								            content_type=content_type,
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								            strategy=PartitionStrategy.HI_RES,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        )
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
 								def test_auto_partition_html_from_file_rb():
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* parition pptx and tests

* add parition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    with open(filename, "rb") as f:
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
-												build: wolfi base image for Dockerfile (#3016)

### Summary

Updates the `Dockerfile` to use the Chainguard `wolfi-base` image to
reduce CVEs. Also adds a step in the docker publish job that scans the
images and checks for CVEs before publishing. The job will fail if there
are high or critical vulnerabilities.

### Testing

Run `make docker-run-dev` and then `python3.11` once you're in. And that
point, you can try:

```python
from unstructured.partition.auto import partition
elements = partition(filename="example-docs/DA-1p.pdf", skip_infer_table_types=["pdf"])
elements
```

Stop the container once you're done.
											
										
										
											2024-05-15 18:53:15 -04:00
+								# NOTE(robinson) - skipping this test with docker image to avoid putting the
 								# test fixtures into the image
 								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
-												Better element IDs - deterministic and document-unique hashes (#2673)

Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes element's sequence number on page, page
number, document filename and its text
* there are more test for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

											
										
										
											2024-04-24 09:05:20 +02:00
+								def test_auto_partitioned_json_output_maintains_consistency_with_fixture_elements():
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								    """Test auto-processing an unstructured json output file by filename."""
-												Better element IDs - deterministic and document-unique hashes (#2673)

Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes element's sequence number on page, page
number, document filename and its text
* there are more test for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

											
										
										
											2024-04-24 09:05:20 +02:00
+								    original_file_name = "spring-weather.html"
 								    json_file_path = (
 								        pathlib.Path(DIRECTORY).parents[1]
 								        / "test_unstructured_ingest"
 								        / "expected-structured-output"
 								        / "azure"
 								        / f"{original_file_name}.json"
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								    )
-												Better element IDs - deterministic and document-unique hashes (#2673)

Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes element's sequence number on page, page
number, document filename and its text
* there are more test for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

											
										
										
											2024-04-24 09:05:20 +02:00
+								    with open(json_file_path) as json_f:
 								        expected_result = json.load(json_f)
 								    partitioning_result = json.loads(
 								        elements_to_json(
 								            partition(
 								                filename=json_file_path,
 								                # -- use the original file name to get the same element IDs (hashes) --
 								                metadata_filename=original_file_name,
 								                strategy=PartitionStrategy.HI_RES,
 								            )
 								        )
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								    )
-												Better element IDs - deterministic and document-unique hashes (#2673)

Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes element's sequence number on page, page
number, document filename and its text
* there are more test for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

											
										
										
											2024-04-24 09:05:20 +02:00
+								    for elem in partitioning_result:
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								        elem.pop("metadata")
-												Better element IDs - deterministic and document-unique hashes (#2673)

Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes element's sequence number on page, page
number, document filename and its text
* there are more test for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

											
										
										
											2024-04-24 09:05:20 +02:00
+								    for elem in expected_result:
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								        elem.pop("metadata")
-												Better element IDs - deterministic and document-unique hashes (#2673)

Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes element's sequence number on page, page
number, document filename and its text
* there are more test for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

											
										
										
											2024-04-24 09:05:20 +02:00
+								    assert expected_result == partitioning_result
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
-												refactor: simplifies JSON detection and add tests (#975)

* refactor json detection

* version and changelog

* fix mock in test
											
										
										
											2023-07-25 15:59:45 -04:00
+								def test_auto_partition_json_raises_with_unprocessable_json(tmpdir):
 								    # NOTE(robinson) - This is unprocessable because it is not a list of dicts,
 								    # per the Unstructured ISD format
 								    text = '{"hi": "there"}'
 								    filename = os.path.join(tmpdir, "unprocessable.json")
 								    with open(filename, "w") as f:
 								        f.write(text)
 								    with pytest.raises(ValueError):
 								        partition(filename=filename)
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								@pytest.mark.xfail(
 								    reason="parsed as text not json, https://github.com/Unstructured-IO/unstructured/issues/492",
 								)
 								def test_auto_partition_json_from_file():
 								    """Test auto-processing an unstructured json output file by file handle."""
 								    filename = os.path.join(
 								        EXAMPLE_DOCS_DIRECTORY,
 								        "..",
 								        "test_unstructured_ingest",
 								        "expected-structured-output",
 								        "azure-blob-storage",
 								        "spring-weather.html.json",
 								    )
 								    with open(filename) as json_f:
 								        json_data = json.load(json_f)
 								    with open(filename, encoding="utf-8") as partition_f:
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        json_elems = json.loads(
 								            elements_to_json(partition(file=partition_f, strategy=PartitionStrategy.HI_RES))
 								        )
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								    for elem in json_elems:
 								        # coordinates are always in the element data structures, even if None
 								        elem.pop("coordinates")
-												feat: coordinate systems (#774)

Added the CoordinateSystem class for tracking the system in which coordinates are represented, and changing the system if desired.
											
										
										
											2023-06-20 11:19:55 -05:00
+								        elem.pop("coordinate_system")
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								    assert json_data == json_elems
-												feat: add support for `.txt` files in `partition` (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
											
										
										
											2023-01-13 16:39:53 -05:00
+								EXPECTED_TEXT_OUTPUT = [
 								    NarrativeText(text="This is a test document to use for unit tests."),
-												fix: cleanup from live `.docx` tests (#177)

* add env var for cap threshold; raise default threshold

* update docs and tests

* added check for ending in a comma

* update docs

* no caps check for all upper text

* capture Text in html and text

* check category in Text equality check

* lower case all caps before checking for verbs

* added check for us city/state/zip

* added address type

* add address to html

* add address to text

* fix for text tests; escape for large text segments

* refactor regex for readability

* update comment

* additional test for text with linebreaks

* update docs

* update changelog

* update elements docs

* remove old comment

* case -> cast

* type fix
											
										
										
											2023-01-26 10:52:25 -05:00
+								    Address(text="Doylestown, PA 18901"),
-												feat: add support for `.txt` files in `partition` (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
											
										
										
											2023-01-13 16:39:53 -05:00
+								    Title(text="Important points:"),
 								    ListItem(text="Hamburgers are delicious"),
 								    ListItem(text="Dogs are the best"),
 								    ListItem(text="I love fuzzy blankets"),
 								]
 								def test_auto_partition_text_from_filename():
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* parition pptx and tests

* add parition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
-												feat: add support for `.txt` files in `partition` (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
											
										
										
											2023-01-13 16:39:53 -05:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_TEXT_OUTPUT
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset outpout file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												enhancement: add `file_directory` to element metadata (#585)

* enhancement: add `file_directory` to element metadata

* update msg test

* exclude file_directory

* update slack output

* added file directory tests on partition_x paths
											
										
										
											2023-05-15 18:25:39 -04:00
+								    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
-												feat: add support for `.txt` files in `partition` (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
											
										
										
											2023-01-13 16:39:53 -05:00
 								def test_auto_partition_text_from_file():
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* parition pptx and tests

* add parition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    with open(filename) as f:
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-												feat: add support for `.txt` files in `partition` (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
											
										
										
											2023-01-13 16:39:53 -05:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_TEXT_OUTPUT
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
 								)
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								def test_auto_partition_pdf_from_filename(pass_metadata_filename, content_type, request):
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* parition pptx and tests

* add parition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    metadata_filename = filename if pass_metadata_filename else None
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(
 								        filename=filename,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								        metadata_filename=metadata_filename,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        content_type=content_type,
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        strategy=PartitionStrategy.HI_RES,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    )
-												chore: return `Element` objects in `partition_pdf` and `partition_image` (#164)

* helper function to convert to element

* test for element types

* fix for healthcheck url

* version bump

* note on coordinates

* mention FigureCaption

* test_shared -> test_common

* add check boxes for checkbox template

* update changelog
											
										
										
											2023-01-19 09:29:28 -05:00
-												build: wolfi base image for Dockerfile (#3016)

### Summary

Updates the `Dockerfile` to use the Chainguard `wolfi-base` image to
reduce CVEs. Also adds a step in the docker publish job that scans the
images and checks for CVEs before publishing. The job will fail if there
are high or critical vulnerabilities.

### Testing

Run `make docker-run-dev` and then `python3.11` once you're in. And that
point, you can try:

```python
from unstructured.partition.auto import partition
elements = partition(filename="example-docs/DA-1p.pdf", skip_infer_table_types=["pdf"])
elements
```

Stop the container once you're done.
											
										
										
											2024-05-15 18:53:15 -04:00
+								    # NOTE(alan): Xfail since new model skips the word Zejiang
 								    request.applymarker(pytest.mark.xfail)
-												Fix/1209 tweak xycut ordering output (#1630)

Closes GH Issue #1209.

### Summary
- add swapped `xycut` sorting
- update `xycut` sorting evaluation script

PDFs:
-
[sbaa031.073.pdf](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7234218/pdf/sbaa031.073.pdf)
-
[multi-column-2p.pdf](https://github.com/Unstructured-IO/unstructured/files/12796147/multi-column-2p.pdf)
-
[11723901.pdf](https://github.com/Unstructured-IO/unstructured-inference/files/12360085/11723901.pdf)
### Testing
```
elements = partition_pdf("sbaa031.073.pdf", strategy="hi_res")
print("\n\n".join([str(el) for el in elements]))
```
### Evaluation
```
PYTHONPATH=. python examples/custom-layout-order/evaluate_xy_cut_sorting.py sbaa031.073.pdf hi_res xycut_only
```

											
										
										
											2023-10-05 00:41:38 -07:00
+								    idx = 3
 								    assert isinstance(elements[idx], Title)
 								    assert elements[idx].text.startswith("LayoutParser")
-												chore: return `Element` objects in `partition_pdf` and `partition_image` (#164)

* helper function to convert to element

* test for element types

* fix for healthcheck url

* version bump

* note on coordinates

* mention FigureCaption

* test_shared -> test_common

* add check boxes for checkbox template

* update changelog
											
										
										
											2023-01-19 09:29:28 -05:00
-												Fix/1209 tweak xycut ordering output (#1630)

Closes GH Issue #1209.

### Summary
- add swapped `xycut` sorting
- update `xycut` sorting evaluation script

PDFs:
-
[sbaa031.073.pdf](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7234218/pdf/sbaa031.073.pdf)
-
[multi-column-2p.pdf](https://github.com/Unstructured-IO/unstructured/files/12796147/multi-column-2p.pdf)
-
[11723901.pdf](https://github.com/Unstructured-IO/unstructured-inference/files/12360085/11723901.pdf)
### Testing
```
elements = partition_pdf("sbaa031.073.pdf", strategy="hi_res")
print("\n\n".join([str(el) for el in elements]))
```
### Evaluation
```
PYTHONPATH=. python examples/custom-layout-order/evaluate_xy_cut_sorting.py sbaa031.073.pdf hi_res xycut_only
```

											
										
										
											2023-10-05 00:41:38 -07:00
+								    assert elements[idx].metadata.filename == os.path.basename(filename)
 								    assert elements[idx].metadata.file_directory == os.path.split(filename)[0]
-												feat: add metadata tracking to document elements (#225)

* add metadata field to elements

* metadata tracking for pdf/image

* metadata for html

* update expected outputs

* metadata for the rest of the document types

* take out file metadata for now

* add url to tables

* added metadata to test_auto

* bump version

* added coordinates to __init__

* fix coordinates in tests
											
										
										
											2023-02-15 13:26:20 -05:00
-												Fix/1209 tweak xycut ordering output (#1630)

Closes GH Issue #1209.

### Summary
- add swapped `xycut` sorting
- update `xycut` sorting evaluation script

PDFs:
-
[sbaa031.073.pdf](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7234218/pdf/sbaa031.073.pdf)
-
[multi-column-2p.pdf](https://github.com/Unstructured-IO/unstructured/files/12796147/multi-column-2p.pdf)
-
[11723901.pdf](https://github.com/Unstructured-IO/unstructured-inference/files/12360085/11723901.pdf)
### Testing
```
elements = partition_pdf("sbaa031.073.pdf", strategy="hi_res")
print("\n\n".join([str(el) for el in elements]))
```
### Evaluation
```
PYTHONPATH=. python examples/custom-layout-order/evaluate_xy_cut_sorting.py sbaa031.073.pdf hi_res xycut_only
```

											
										
										
											2023-10-05 00:41:38 -07:00
+								    idx += 1
 								    assert isinstance(elements[idx], NarrativeText)
 								    assert elements[idx].text.startswith("Zejiang Shen")
-												build(deps): update inference version (#662)

Updated to the the latest version of unstructured-inference. detectron2 now gets implemented with onnxruntime, yay!

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
											
										
										
											2023-05-31 13:50:15 -05:00
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												feat: extract tables (#503)

Exposes table extraction through partition and partition_pdf.
											
										
										
											2023-04-21 12:01:29 -05:00
+								def test_auto_partition_pdf_uses_table_extraction():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
 								    with patch(
-												Refactor: support merging `extracted` layout with `inferred` layout (#2158)

### Summary
This PR is the second part of `pdfminer` refactor to move it from
`unstructured-inference` repo to `unstructured` repo, the first part is
done in
https://github.com/Unstructured-IO/unstructured-inference/pull/294. This
PR adds logic to merge the extracted layout with the inferred layout.

The updated workflow for the `hi_res` strategy:
* pass the document (as data/filename) to the `inference` repo to get
`inferred_layout` (DocumentLayout)
* pass the `inferred_layout` returned from the `inference` repo and the
document (as data/filename) to the `pdfminer_processing` module, which
first opens the document (create temp file/dir as needed), and splits
the document by pages
* if is_image is `True`, return the passed
inferred_layout(DocumentLayout)
  * if is_image is `False`:
* get extracted_layout (TextRegions) from the passed
document(data/filename) by pdfminer
* merge `extracted_layout` (TextRegions) with the passed
`inferred_layout` (DocumentLayout)
* return the `inferred_layout `(DocumentLayout) with updated elements
(all merged LayoutElements) as merged_layout (DocumentLayout)
* pass merged_layout and the document (as data/filename) to the `OCR`
module, which first opens the document (create temp file/dir as needed),
and splits the document by pages (convert PDF pages to image pages for
PDF file)

### Note
This PR also fixes issue #2164 by using functionality similar to the one
implemented in the `fast` strategy workflow when extracting elements by
`pdfminer`.

### TODO
* image extraction refactor to move it from `unstructured-inference`
repo to `unstructured` repo
* improving natural reading order by applying the current default
`xycut` sorting to the elements extracted by `pdfminer`
											
										
										
											2023-12-01 12:56:31 -08:00
+								        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
-												feat: extract tables (#503)

Exposes table extraction through partition and partition_pdf.
											
										
										
											2023-04-21 12:01:29 -05:00
+								    ) as mock_process_file_with_model:
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        partition(filename, pdf_infer_table_structure=True, strategy=PartitionStrategy.HI_RES)
-												fix: decide table extraction (#3090)

This PR aims to add backward compatibility for the deprecated
`pdf_infer_table_structure` parameter. A missing part of turning table
extraction for PDFs and Images off by default in
https://github.com/Unstructured-IO/unstructured/pull/3035, which was
turned on in https://github.com/Unstructured-IO/unstructured/pull/2588.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-05-23 13:37:15 -07:00
+								        assert mock_process_file_with_model.call_args[1]["infer_table_structure"]
-												feat: extract tables (#503)

Exposes table extraction through partition and partition_pdf.
											
										
										
											2023-04-21 12:01:29 -05:00
-												enhancement: tell users to install missing extras (#1167)

### Summary

Updates `partition` to let users know to installs the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating `partition_pdf` (or whichever function that requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
											
										
										
											2023-08-21 23:00:21 -04:00
+								def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf fallsback to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
 								    mock_return = [NarrativeText("Hello there!")]
 								    with patch.object(auto, "partition_pdf", return_value=mock_return) as mock_partition:
-												enhancement: tell users to install missing extras (#1167)

### Summary

Updates `partition` to let users know to installs the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating `partition_pdf` (or whichever function that requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
											
										
										
											2023-08-21 23:00:21 -04:00
+								        mock_partition_with_extras_map = {"pdf": mock_partition}
 								        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        partition(filename=filename, strategy=PartitionStrategy.FAST)
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf fallsback to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
 								    mock_partition.assert_called_once_with(
 								        filename=filename,
 								        file=None,
 								        url=None,
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        strategy=PartitionStrategy.FAST,
-												detect document language across all partitioners (#1627)

### Summary
Closes #1534 and #1535
Detects document language using `langdetect` package. 
Creates new kwargs for user to set the document language (`languages`)
or detect the language at the element level instead of the default
document level (`detect_language_per_element`)

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
											
										
										
											2023-10-10 20:47:56 -05:00
+								        languages=None,
-												Feat: return base64 encoded images for PDF's (#2310)

Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition

elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
											
										
										
											2023-12-26 21:39:01 -08:00
+								        metadata_filename=None,
 								        include_page_breaks=False,
-												BREAKING CHANGE: revert table extraction off by default for PDFs and images (#3035)

### Summary

Closes #3021 . Turns table extraction for PDFs and images off by
default. The default behavior originally changed in #2588 . The reason
for reversion is that some users did not realize turning off table
extraction was an option and experience long processing times for PDFs
and images with the new default behavior.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
											
										
										
											2024-05-17 11:28:11 -04:00
+								        infer_table_structure=False,
-												Feat: return base64 encoded images for PDF's (#2310)

Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition

elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
											
										
										
											2023-12-26 21:39:01 -08:00
+								        extract_images_in_pdf=False,
-												Refactor: rename image extraction kwargs (#2303)

Currently, we're using different kwarg names in partition() and
partition_pdf(), which has implications for the API since it goes
through partition().

### Summary
- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` to `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in
`partition.auto`
- add unit tests to test element extraction for `pdf/image` via
`partition.auto`
### Testing
CI should pass.
											
										
										
											2024-01-04 09:52:00 -08:00
+								        extract_image_block_types=None,
 								        extract_image_block_output_dir=None,
 								        extract_image_block_to_payload=False,
-												chore: add hi_res_model_name kwarg (#2289)

Closes #2160 

Explicitly adds `hi_res_model_name` as kwarg to relevant functions and
notes that `model_name` is to be deprecated.

Testing:
```
from unstructured.partition.auto import partition
filename = "example-docs/DA-1p.pdf"
elements = partition(filename, strategy="hi_res", hi_res_model_name="yolox")
```

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Steve Canny <stcanny@gmail.com>
Co-authored-by: Christine Straub <christinemstraub@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-12-22 09:06:54 -06:00
+								        hi_res_model_name=None,
-												feat: introduce `date_from_file_object` parameter to partitions (#2563)

Introduce `date_from_file_object` to `partition*` functions, by default
set to `False`.
If set to `True` and file is provided via `file` parameter, partition
will attempt to infer last modified date from `file`'s contents
otherwise last modified metadata will be set to `None`.

---------

Co-authored-by: Filip Knefel <filip@unstructured.io>
Co-authored-by: Ronny H <138828701+ron-unstructured@users.noreply.github.com>
											
										
										
											2024-03-18 02:09:44 +01:00
+								        date_from_file_object=False,
-												Introduce `start_page` argument to partitioning functions that assign `element.metadata.page_number` (#2884)

This small change will be useful for users who partition only fragments
of their PDF documents.
It's a small step towards addressing this issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

Related PRs:
* https://github.com/Unstructured-IO/unstructured/pull/2842
* https://github.com/Unstructured-IO/unstructured/pull/2673
											
										
										
											2024-04-15 23:03:42 +02:00
+								        starting_page_number=1,
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf fallsback to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
+								    )
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
 								)
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, request):
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* parition pptx and tests

* add parition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    metadata_filename = filename if pass_metadata_filename else None
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    with open(filename, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(
 								            file=f,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								            metadata_filename=metadata_filename,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								            content_type=content_type,
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								            strategy=PartitionStrategy.HI_RES,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        )
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
-												build: wolfi base image for Dockerfile (#3016)

### Summary

Updates the `Dockerfile` to use the Chainguard `wolfi-base` image to
reduce CVEs. Also adds a step in the docker publish job that scans the
images and checks for CVEs before publishing. The job will fail if there
are high or critical vulnerabilities.

### Testing

Run `make docker-run-dev` and then `python3.11` once you're in. And that
point, you can try:

```python
from unstructured.partition.auto import partition
elements = partition(filename="example-docs/DA-1p.pdf", skip_infer_table_types=["pdf"])
elements
```

Stop the container once you're done.
											
										
										
											2024-05-15 18:53:15 -04:00
+								    # NOTE(alan): Xfail since new model skips the word Zejiang
 								    request.applymarker(pytest.mark.xfail)
-												Fix/1209 tweak xycut ordering output (#1630)

Closes GH Issue #1209.

### Summary
- add swapped `xycut` sorting
- update `xycut` sorting evaluation script

PDFs:
-
[sbaa031.073.pdf](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7234218/pdf/sbaa031.073.pdf)
-
[multi-column-2p.pdf](https://github.com/Unstructured-IO/unstructured/files/12796147/multi-column-2p.pdf)
-
[11723901.pdf](https://github.com/Unstructured-IO/unstructured-inference/files/12360085/11723901.pdf)
### Testing
```
elements = partition_pdf("sbaa031.073.pdf", strategy="hi_res")
print("\n\n".join([str(el) for el in elements]))
```
### Evaluation
```
PYTHONPATH=. python examples/custom-layout-order/evaluate_xy_cut_sorting.py sbaa031.073.pdf hi_res xycut_only
```

											
										
										
											2023-10-05 00:41:38 -07:00
+								    idx = 3
 								    assert isinstance(elements[idx], Title)
 								    assert elements[idx].text.startswith("LayoutParser")
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
-												Fix/1209 tweak xycut ordering output (#1630)

Closes GH Issue #1209.

### Summary
- add swapped `xycut` sorting
- update `xycut` sorting evaluation script

PDFs:
-
[sbaa031.073.pdf](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7234218/pdf/sbaa031.073.pdf)
-
[multi-column-2p.pdf](https://github.com/Unstructured-IO/unstructured/files/12796147/multi-column-2p.pdf)
-
[11723901.pdf](https://github.com/Unstructured-IO/unstructured-inference/files/12360085/11723901.pdf)
### Testing
```
elements = partition_pdf("sbaa031.073.pdf", strategy="hi_res")
print("\n\n".join([str(el) for el in elements]))
```
### Evaluation
```
PYTHONPATH=. python examples/custom-layout-order/evaluate_xy_cut_sorting.py sbaa031.073.pdf hi_res xycut_only
```

											
										
										
											2023-10-05 00:41:38 -07:00
+								    idx += 1
 								    assert isinstance(elements[idx], NarrativeText)
 								    assert elements[idx].text.startswith("Zejiang Shen")
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
-												chore: function to map between standard and Tesseract language codes (#1421)

### Summary
In order to convert between incompatible language codes from packages
used for OCR, this change adds a function to map between any standard
language codes and tesseract OCR specific codes. Users can input
language information to `languages` in any Tesseract-supported langcode
or any ISO 639 standard language code.

### Details
- Introduces the
[python-iso639](https://pypi.org/project/python-iso639/) package for
matching standard language codes. Recompiles all dependencies.
- If a language is not already supplied by the user as a Tesseract
specific langcode, supplies all possible script/orthography variants of
the language to the Tesseract OCR agent.

### Test
Added many unit tests for a variety of language combinations, special
cases, and variants. For general testing, call partition functions with
any lang codes in the languages parameter (Tesseract or standard).

for example,
```
from unstructured.partition.auto import partition

elements = partition(filename="example-docs/layout-parser-paper.pdf", strategy="hi_res", languages=["en", "chi"])
print("\n\n".join([str(el) for el in elements]))
```
should supply eng+chi_sim+chi_sim_vert+chi_tra+chi_tra_vert to Tesseract
											
										
										
											2023-09-18 11:42:02 -04:00
+								def test_auto_partition_formats_languages_for_tesseract():
 								    filename = "example-docs/chi_sim_image.jpeg"
 								    with patch(
-												Refactor: support merging `extracted` layout with `inferred` layout (#2158)

### Summary
This PR is the second part of `pdfminer` refactor to move it from
`unstructured-inference` repo to `unstructured` repo, the first part is
done in
https://github.com/Unstructured-IO/unstructured-inference/pull/294. This
PR adds logic to merge the extracted layout with the inferred layout.

The updated workflow for the `hi_res` strategy:
* pass the document (as data/filename) to the `inference` repo to get
`inferred_layout` (DocumentLayout)
* pass the `inferred_layout` returned from the `inference` repo and the
document (as data/filename) to the `pdfminer_processing` module, which
first opens the document (create temp file/dir as needed), and splits
the document by pages
* if is_image is `True`, return the passed
inferred_layout(DocumentLayout)
  * if is_image is `False`:
* get extracted_layout (TextRegions) from the passed
document(data/filename) by pdfminer
* merge `extracted_layout` (TextRegions) with the passed
`inferred_layout` (DocumentLayout)
* return the `inferred_layout `(DocumentLayout) with updated elements
(all merged LayoutElements) as merged_layout (DocumentLayout)
* pass merged_layout and the document (as data/filename) to the `OCR`
module, which first opens the document (create temp file/dir as needed),
and splits the document by pages (convert PDF pages to image pages for
PDF file)

### Note
This PR also fixes issue #2164 by using functionality similar to the one
implemented in the `fast` strategy workflow when extracting elements by
`pdfminer`.

### TODO
* image extraction refactor to move it from `unstructured-inference`
repo to `unstructured` repo
* improving natural reading order by applying the current default
`xycut` sorting to the elements extracted by `pdfminer`
											
										
										
											2023-12-01 12:56:31 -08:00
+								        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
-												Refactor: support entire page OCR with `ocr_mode` and `ocr_languages` (#1579)

## Summary
Second part of OCR refactor to move it from inference repo to
unstructured repo, first part is done in
https://github.com/Unstructured-IO/unstructured-inference/pull/231. This
PR adds OCR process logics to entire page OCR, and support two OCR
modes, "entire_page" or "individual_blocks".

The updated workflow for `Hi_res` partition:
* pass the document as data/filename to inference repo to get
`inferred_layout` (DocumentLayout)
* pass the document as data/filename to OCR module, which first open the
document (create temp file/dir as needed), and split the document by
pages (convert PDF pages to image pages for PDF file)
* if ocr mode is `"entire_page"`
    *  OCR the entire image
    * merge the OCR layout with inferred page layout
 * if ocr mode is `"individual_blocks"`
* from inferred page layout, find element with no extracted text, crop
the entire image by the bboxes of the element
* replace empty text element with the text obtained from OCR the cropped
image
* return all merged PageLayouts and form a DocumentLayout subject for
later on process

This PR also bump `unstructured-inference==0.7.2` since the branch relay
on OCR refactor from unstructured-inference.
  
## Test
```
from unstructured.partition.auto import partition

entrie_page_ocr_mode_elements = partition(filename="example-docs/english-and-korean.png", ocr_mode="entire_page", ocr_languages="eng+kor", strategy="hi_res")
individual_blocks_ocr_mode_elements = partition(filename="example-docs/english-and-korean.png", ocr_mode="individual_blocks", ocr_languages="eng+kor", strategy="hi_res")
print([el.text for el in entrie_page_ocr_mode_elements])
print([el.text for el in individual_blocks_ocr_mode_elements])
```
latest output:
```
# entrie_page
['RULES AND INSTRUCTIONS 1. Template for day 1 (korean) , for day 2 (English) for day 3 both English and korean. 2. Use all your accounts. use different emails to send. Its better to have many email', 'accounts.', 'Note: Remember to write your own "OPENING MESSAGE" before you copy and paste the template. please always include [TREASURE HARUTO] for example:', '안녕하세요, 저 희 는 YGEAS 그룹 TREASUREWH HARUTOM|2] 팬 입니다. 팬 으 로서, HARUTO 씨 받 는 대 우 에 대해 의 구 심 과 불 공 평 함 을 LRU, 이 일 을 통해 저 희 의 의 혹 을 전 달 하여 귀 사 의 진지한 민 과 적극적인 답 변 을 받을 수 있 기 를 바랍니다.', '3. CC Harutonations@gmail.com so we can keep track of how many emails were', 'successfully sent', '4. Use the hashtag of Haruto on your tweet to show that vou have sent vour email]', '메 고']
# individual_blocks
['RULES AND INSTRUCTIONS 1. Template for day 1 (korean) , for day 2 (English) for day 3 both English and korean. 2. Use all your accounts. use different emails to send. Its better to have many email', 'Note: Remember to write your own "OPENING MESSAGE" before you copy and paste the template. please always include [TREASURE HARUTO] for example:', '안녕하세요, 저 희 는 YGEAS 그룹 TREASURES HARUTOM| 2] 팬 입니다. 팬 으로서, HARUTO 씨 받 는 대 우 에 대해 의 구 심 과 habe ERO, 이 머 일 을 적극 저 희 의 ASS 전 달 하여 귀 사 의 진지한 고 2 있 기 를 바랍니다.', '3. CC Harutonations@gmail.com so we can keep track of how many emails were ciiccecefisliy cant', 'VULLESSIULY Set 4. Use the hashtag of Haruto on your tweet to show that you have sent your email']
```

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: yuming-long <yuming-long@users.noreply.github.com>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2023-10-06 18:54:49 -04:00
+								    ) as mock_process_file_with_ocr:
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        partition(filename, strategy=PartitionStrategy.HI_RES, languages=["zh"])
-												Refactor: support entire page OCR with `ocr_mode` and `ocr_languages` (#1579)

## Summary
Second part of OCR refactor to move it from inference repo to
unstructured repo, first part is done in
https://github.com/Unstructured-IO/unstructured-inference/pull/231. This
PR adds OCR process logics to entire page OCR, and support two OCR
modes, "entire_page" or "individual_blocks".

The updated workflow for `Hi_res` partition:
* pass the document as data/filename to inference repo to get
`inferred_layout` (DocumentLayout)
* pass the document as data/filename to OCR module, which first open the
document (create temp file/dir as needed), and split the document by
pages (convert PDF pages to image pages for PDF file)
* if ocr mode is `"entire_page"`
    *  OCR the entire image
    * merge the OCR layout with inferred page layout
 * if ocr mode is `"individual_blocks"`
* from inferred page layout, find element with no extracted text, crop
the entire image by the bboxes of the element
* replace empty text element with the text obtained from OCR the cropped
image
* return all merged PageLayouts and form a DocumentLayout subject for
later on process

This PR also bump `unstructured-inference==0.7.2` since the branch relay
on OCR refactor from unstructured-inference.
  
## Test
```
from unstructured.partition.auto import partition

entrie_page_ocr_mode_elements = partition(filename="example-docs/english-and-korean.png", ocr_mode="entire_page", ocr_languages="eng+kor", strategy="hi_res")
individual_blocks_ocr_mode_elements = partition(filename="example-docs/english-and-korean.png", ocr_mode="individual_blocks", ocr_languages="eng+kor", strategy="hi_res")
print([el.text for el in entrie_page_ocr_mode_elements])
print([el.text for el in individual_blocks_ocr_mode_elements])
```
latest output:
```
# entrie_page
['RULES AND INSTRUCTIONS 1. Template for day 1 (korean) , for day 2 (English) for day 3 both English and korean. 2. Use all your accounts. use different emails to send. Its better to have many email', 'accounts.', 'Note: Remember to write your own "OPENING MESSAGE" before you copy and paste the template. please always include [TREASURE HARUTO] for example:', '안녕하세요, 저 희 는 YGEAS 그룹 TREASUREWH HARUTOM|2] 팬 입니다. 팬 으 로서, HARUTO 씨 받 는 대 우 에 대해 의 구 심 과 불 공 평 함 을 LRU, 이 일 을 통해 저 희 의 의 혹 을 전 달 하여 귀 사 의 진지한 민 과 적극적인 답 변 을 받을 수 있 기 를 바랍니다.', '3. CC Harutonations@gmail.com so we can keep track of how many emails were', 'successfully sent', '4. Use the hashtag of Haruto on your tweet to show that vou have sent vour email]', '메 고']
# individual_blocks
['RULES AND INSTRUCTIONS 1. Template for day 1 (korean) , for day 2 (English) for day 3 both English and korean. 2. Use all your accounts. use different emails to send. Its better to have many email', 'Note: Remember to write your own "OPENING MESSAGE" before you copy and paste the template. please always include [TREASURE HARUTO] for example:', '안녕하세요, 저 희 는 YGEAS 그룹 TREASURES HARUTOM| 2] 팬 입니다. 팬 으로서, HARUTO 씨 받 는 대 우 에 대해 의 구 심 과 habe ERO, 이 머 일 을 적극 저 희 의 ASS 전 달 하여 귀 사 의 진지한 고 2 있 기 를 바랍니다.', '3. CC Harutonations@gmail.com so we can keep track of how many emails were ciiccecefisliy cant', 'VULLESSIULY Set 4. Use the hashtag of Haruto on your tweet to show that you have sent your email']
```

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: yuming-long <yuming-long@users.noreply.github.com>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2023-10-06 18:54:49 -04:00
+								        _, kwargs = mock_process_file_with_ocr.call_args_list[0]
 								        assert "ocr_languages" in kwargs
 								        assert kwargs["ocr_languages"] == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
-												chore: function to map between standard and Tesseract language codes (#1421)

### Summary
In order to convert between incompatible language codes from packages
used for OCR, this change adds a function to map between any standard
language codes and tesseract OCR specific codes. Users can input
language information to `languages` in any Tesseract-supported langcode
or any ISO 639 standard language code.

### Details
- Introduces the
[python-iso639](https://pypi.org/project/python-iso639/) package for
matching standard language codes. Recompiles all dependencies.
- If a language is not already supplied by the user as a Tesseract
specific langcode, supplies all possible script/orthography variants of
the language to the Tesseract OCR agent.

### Test
Added many unit tests for a variety of language combinations, special
cases, and variants. For general testing, call partition functions with
any lang codes in the languages parameter (Tesseract or standard).

for example,
```
from unstructured.partition.auto import partition

elements = partition(filename="example-docs/layout-parser-paper.pdf", strategy="hi_res", languages=["en", "chi"])
print("\n\n".join([str(el) for el in elements]))
```
should supply eng+chi_sim+chi_sim_vert+chi_tra+chi_tra_vert to Tesseract
											
										
										
											2023-09-18 11:42:02 -04:00
-												feat: introduce language detection function for text partitioning function (#1453)

### Summary
Uses `langdetect` to detect all languages present in the input document.

### Details
- Converts all language codes (whether user inputted or detected using
`langdetect`) to a standard ISO 639-3 code.
- Adds `languages` field to the metadata
- Will revisit how to nonstandardly represent simplified vs traditional
Chinese scripts internally (separate PR).
- Update ingest test results to add `languages` field to documents. Some
other side effects are changes in order of some elements and changes in
element categorization

### Test
You can test the detect_languages function individually by importing the
function and inputting a text sample and optionally a language:
```
text = "My lubimy mleko i chleb."
doc_langs = detect_languages(text)
print(doc_langs)
```
-> ['ces', 'pol', 'slk']

---------

Co-authored-by: Newel H <37004249+newelh@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: shreyanid <shreyanid@users.noreply.github.com>
Co-authored-by: Trevor Bossert <37596773+tabossert@users.noreply.github.com>
Co-authored-by: Ronny H <138828701+ron-unstructured@users.noreply.github.com>
											
										
										
											2023-09-26 14:09:27 -04:00
+								def test_auto_partition_element_metadata_user_provided_languages():
 								    filename = "example-docs/chevron-page.pdf"
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								    elements = partition(filename=filename, strategy=PartitionStrategy.OCR_ONLY, languages=["eng"])
-												feat: introduce language detection function for text partitioning function (#1453)

### Summary
Uses `langdetect` to detect all languages present in the input document.

### Details
- Converts all language codes (whether user inputted or detected using
`langdetect`) to a standard ISO 639-3 code.
- Adds `languages` field to the metadata
- Will revisit how to nonstandardly represent simplified vs traditional
Chinese scripts internally (separate PR).
- Update ingest test results to add `languages` field to documents. Some
other side effects are changes in order of some elements and changes in
element categorization

### Test
You can test the detect_languages function individually by importing the
function and inputting a text sample and optionally a language:
```
text = "My lubimy mleko i chleb."
doc_langs = detect_languages(text)
print(doc_langs)
```
-> ['ces', 'pol', 'slk']

---------

Co-authored-by: Newel H <37004249+newelh@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: shreyanid <shreyanid@users.noreply.github.com>
Co-authored-by: Trevor Bossert <37596773+tabossert@users.noreply.github.com>
Co-authored-by: Ronny H <138828701+ron-unstructured@users.noreply.github.com>
											
										
										
											2023-09-26 14:09:27 -04:00
+								    assert elements[0].metadata.languages == ["eng"]
-												fix languages 500 error with empty string for ocr_languages (#1968)

Closes #1870 
Defining both `languages` and `ocr_languages` raises a ValueError, but
the api defaults to `ocr_languages` being an empty string, so if users
define `languages` they are automatically hitting the ValueError.

This fix checks if `ocr_languages` is an empty string and converts it to
`None` to avoid this.

### Testing
On the main branch, the following will raise the ValueError, but it will
correctly partition on this branch
```
from unstructured.partition.auto import partition
filename = "example-docs/category-level.docx"
elements = partition(filename,languages=['spa'],ocr_languages="")

elements[0].metadata.languages
```

---------

Co-authored-by: yuming <305248291@qq.com>
Co-authored-by: Yuming Long <63475068+yuming-long@users.noreply.github.com>
Co-authored-by: Austin Walker <awalk89@gmail.com>
											
										
										
											2023-11-01 17:02:00 -05:00
+								@pytest.mark.parametrize(
 								    ("languages", "ocr_languages"),
 								    [(["auto"], ""), (["eng"], "")],
 								)
 								def test_auto_partition_ignores_empty_string_for_ocr_languages(languages, ocr_languages):
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "book-war-and-peace-1p.txt")
 								    elements = partition(
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        filename=filename,
 								        strategy=PartitionStrategy.OCR_ONLY,
 								        ocr_languages=ocr_languages,
 								        languages=languages,
-												fix languages 500 error with empty string for ocr_languages (#1968)

Closes #1870 
Defining both `languages` and `ocr_languages` raises a ValueError, but
the api defaults to `ocr_languages` being an empty string, so if users
define `languages` they are automatically hitting the ValueError.

This fix checks if `ocr_languages` is an empty string and converts it to
`None` to avoid this.

### Testing
On the main branch, the following will raise the ValueError, but it will
correctly partition on this branch
```
from unstructured.partition.auto import partition
filename = "example-docs/category-level.docx"
elements = partition(filename,languages=['spa'],ocr_languages="")

elements[0].metadata.languages
```

---------

Co-authored-by: yuming <305248291@qq.com>
Co-authored-by: Yuming Long <63475068+yuming-long@users.noreply.github.com>
Co-authored-by: Austin Walker <awalk89@gmail.com>
											
										
										
											2023-11-01 17:02:00 -05:00
+								    )
 								    assert elements[0].metadata.languages == ["eng"]
-												chore: refactor languages parameter for auto partition (#1400)

### Summary
In order to support language functionality other than Tesseract OCR, we
want to represent languages provided for either partitioning accuracy or
OCR as a standard list of langcodes as strings.

### Details
Follows the pattern established with PDFs in #1334. Adds languages (a
list of strings) as a parameter to partition in auto.py. Marks
ocr_languages for deprecation.

### Test
Call partition with a variety of filetypes (especially pdfs/images),
strategies, languages, or ocr_languages.
- inclusion of ocr_languages as a parameter should display a deprecation
warning and may proceed with partitioning if no other conflicts
- the other valid call outputs should be no different from the current
outputs
											
										
										
											2023-09-13 13:07:28 -04:00
+								def test_auto_partition_warns_with_ocr_languages(caplog):
 								    filename = "example-docs/chevron-page.pdf"
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								    partition(filename=filename, strategy=PartitionStrategy.HI_RES, ocr_languages="eng")
-												chore: refactor languages parameter for auto partition (#1400)

### Summary
In order to support language functionality other than Tesseract OCR, we
want to represent languages provided for either partitioning accuracy or
OCR as a standard list of langcodes as strings.

### Details
Follows the pattern established with PDFs in #1334. Adds languages (a
list of strings) as a parameter to partition in auto.py. Marks
ocr_languages for deprecation.

### Test
Call partition with a variety of filetypes (especially pdfs/images),
strategies, languages, or ocr_languages.
- inclusion of ocr_languages as a parameter should display a deprecation
warning and may proceed with partitioning if no other conflicts
- the other valid call outputs should be no different from the current
outputs
											
										
										
											2023-09-13 13:07:28 -04:00
+								    assert "The ocr_languages kwarg will be deprecated" in caplog.text
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
+								def test_partition_pdf_doesnt_raise_warning():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
 								    # NOTE(robinson): This is the recommended way to check that no warning is emitted,
 								    # per the pytest docs.
 								    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
 								    #      #additional-use-cases-of-warnings-in-tests
 								    with warnings.catch_warnings():
 								        warnings.simplefilter("error")
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        partition(filename=filename, strategy=PartitionStrategy.HI_RES)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												Set default strategy for images to be "hi_res" (#968)

Set default strategy for images (not PDFs) to be hi_res.
											
										
										
											2023-08-02 09:22:20 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Set default strategy for images to be "hi_res" (#968)

Set default strategy for images (not PDFs) to be hi_res.
											
										
										
											2023-08-02 09:22:20 -07:00
+								    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
 								)
-												fix: default hi_res model rely on inference setting (#2441)

- there are multiple places setting the default `hi_res_model_name` in
both `unstructured` and `unstructured-inference`
- they lead to inconsistency and unexpected behaviors
- this fix removes a helper in `unstructured` that tries to set the
default hi_res layout detection model; instead we rely on the
`unstructured-inference` to provide that default when no explicit model
name is passed in

## test

```bash
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true ipython
```

```python
from unstructured.partition.auto import partition

# find a pdf file
elements = partition("foo.pdf", strategy="hi_res")
assert elements[0].metadata.detection_origin == "yolox"
```

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: badGarnet <badGarnet@users.noreply.github.com>
											
										
										
											2024-01-29 10:44:41 -06:00
+								def test_auto_partition_image(pass_metadata_filename, content_type):
-												Set default strategy for images to be "hi_res" (#968)

Set default strategy for images (not PDFs) to be hi_res.
											
										
										
											2023-08-02 09:22:20 -07:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    metadata_filename = filename if pass_metadata_filename else None
-												Set default strategy for images to be "hi_res" (#968)

Set default strategy for images (not PDFs) to be hi_res.
											
										
										
											2023-08-02 09:22:20 -07:00
+								    elements = partition(
 								        filename=filename,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								        metadata_filename=metadata_filename,
-												Set default strategy for images to be "hi_res" (#968)

Set default strategy for images (not PDFs) to be hi_res.
											
										
										
											2023-08-02 09:22:20 -07:00
+								        content_type=content_type,
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        strategy=PartitionStrategy.AUTO,
-												Set default strategy for images to be "hi_res" (#968)

Set default strategy for images (not PDFs) to be hi_res.
											
										
										
											2023-08-02 09:22:20 -07:00
+								    )
 								    # should be same result as test_partition_image_default_strategy_hi_res() in test_image.py
-												Fix/1209 tweak xycut ordering output (#1630)

Closes GH Issue #1209.

### Summary
- add swapped `xycut` sorting
- update `xycut` sorting evaluation script

PDFs:
-
[sbaa031.073.pdf](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7234218/pdf/sbaa031.073.pdf)
-
[multi-column-2p.pdf](https://github.com/Unstructured-IO/unstructured/files/12796147/multi-column-2p.pdf)
-
[11723901.pdf](https://github.com/Unstructured-IO/unstructured-inference/files/12360085/11723901.pdf)
### Testing
```
elements = partition_pdf("sbaa031.073.pdf", strategy="hi_res")
print("\n\n".join([str(el) for el in elements]))
```
### Evaluation
```
PYTHONPATH=. python examples/custom-layout-order/evaluate_xy_cut_sorting.py sbaa031.073.pdf hi_res xycut_only
```

											
										
										
											2023-10-05 00:41:38 -07:00
+								    title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
-												fix: default hi_res model rely on inference setting (#2441)

- there are multiple places setting the default `hi_res_model_name` in
both `unstructured` and `unstructured-inference`
- they lead to inconsistency and unexpected behaviors
- this fix removes a helper in `unstructured` that tries to set the
default hi_res layout detection model; instead we rely on the
`unstructured-inference` to provide that default when no explicit model
name is passed in

## test

```bash
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true ipython
```

```python
from unstructured.partition.auto import partition

# find a pdf file
elements = partition("foo.pdf", strategy="hi_res")
assert elements[0].metadata.detection_origin == "yolox"
```

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: badGarnet <badGarnet@users.noreply.github.com>
											
										
										
											2024-01-29 10:44:41 -06:00
+								    idx = 2
-												Fix/1209 tweak xycut ordering output (#1630)

Closes GH Issue #1209.

### Summary
- add swapped `xycut` sorting
- update `xycut` sorting evaluation script

PDFs:
-
[sbaa031.073.pdf](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7234218/pdf/sbaa031.073.pdf)
-
[multi-column-2p.pdf](https://github.com/Unstructured-IO/unstructured/files/12796147/multi-column-2p.pdf)
-
[11723901.pdf](https://github.com/Unstructured-IO/unstructured-inference/files/12360085/11723901.pdf)
### Testing
```
elements = partition_pdf("sbaa031.073.pdf", strategy="hi_res")
print("\n\n".join([str(el) for el in elements]))
```
### Evaluation
```
PYTHONPATH=. python examples/custom-layout-order/evaluate_xy_cut_sorting.py sbaa031.073.pdf hi_res xycut_only
```

											
										
										
											2023-10-05 00:41:38 -07:00
+								    assert elements[idx].text == title
 								    assert elements[idx].metadata.coordinates is not None
-												Set default strategy for images to be "hi_res" (#968)

Set default strategy for images (not PDFs) to be hi_res.
											
										
										
											2023-08-02 09:22:20 -07:00
-												Refactor: rename image extraction kwargs (#2303)

Currently, we're using different kwarg names in partition() and
partition_pdf(), which has implications for the API since it goes
through partition().

### Summary
- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` to `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in
`partition.auto`
- add unit tests to test element extraction for `pdf/image` via
`partition.auto`
### Testing
CI should pass.
											
										
										
											2024-01-04 09:52:00 -08:00
+								@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
 								def test_auto_partition_image_element_extraction(
 								    extract_image_block_to_payload,
 								    filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "embedded-images-tables.jpg"),
 								):
 								    extract_image_block_types = ["Image", "Table"]
 								    with tempfile.TemporaryDirectory() as tmpdir:
 								        elements = partition(
 								            filename=filename,
 								            extract_image_block_types=extract_image_block_types,
 								            extract_image_block_to_payload=extract_image_block_to_payload,
 								            extract_image_block_output_dir=tmpdir,
 								        )
 								        assert_element_extraction(
 								            elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
 								        )
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
 								)
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								def test_auto_partition_jpg(pass_metadata_filename, content_type):
-												enhancement: auto strategy for PDFs and images (#578)

* added functions for determining auto stratgy

* change default strategy to auto

* tests for auto strategy

* update docs

* changelog and version

* bump version

* remove ingest file in wrong location

* update jpg output

* typo fix
											
										
										
											2023-05-12 13:45:08 -04:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    metadata_filename = filename if pass_metadata_filename else None
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(
 								        filename=filename,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								        metadata_filename=metadata_filename,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        content_type=content_type,
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        strategy=PartitionStrategy.AUTO,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    )
-												feat: partition image (#144)

Adds partition_image to partition image file types, which is integrated into the partition brick. This relies on the 0.2.2 version of unstructured-inference.
											
										
										
											2023-01-13 22:24:13 -06:00
+								    assert len(elements) > 0
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
 								)
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								def test_auto_partition_jpg_from_file(pass_metadata_filename, content_type):
-												enhancement: auto strategy for PDFs and images (#578)

* added functions for determining auto stratgy

* change default strategy to auto

* tests for auto strategy

* update docs

* changelog and version

* bump version

* remove ingest file in wrong location

* update jpg output

* typo fix
											
										
										
											2023-05-12 13:45:08 -04:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    metadata_filename = filename if pass_metadata_filename else None
-												feat: partition image (#144)

Adds partition_image to partition image file types, which is integrated into the partition brick. This relies on the 0.2.2 version of unstructured-inference.
											
										
										
											2023-01-13 22:24:13 -06:00
+								    with open(filename, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(
 								            file=f,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								            metadata_filename=metadata_filename,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								            content_type=content_type,
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								            strategy=PartitionStrategy.AUTO,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        )
-												feat: partition image (#144)

Adds partition_image to partition image file types, which is integrated into the partition brick. This relies on the 0.2.2 version of unstructured-inference.
											
										
										
											2023-01-13 22:24:13 -06:00
+								    assert len(elements) > 0
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								def test_auto_partition_raises_with_bad_type(monkeypatch):
 								    monkeypatch.setattr(auto, "detect_filetype", lambda *args, **kwargs: None)
 								    with pytest.raises(ValueError):
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* parition pptx and tests

* add parition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
 								EXPECTED_PPTX_OUTPUT = [
 								    Title(text="Adding a Bullet Slide"),
 								    ListItem(text="Find the bullet slide layout"),
 								    ListItem(text="Use _TextFrame.text for first bullet"),
 								    ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
 								    NarrativeText(text="Here is a lot of text!"),
 								    NarrativeText(text="Here is some text in a text box!"),
 								]
 								def test_auto_partition_pptx_from_filename():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* parition pptx and tests

* add parition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    assert elements == EXPECTED_PPTX_OUTPUT
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset outpout file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												enhancement: add `file_directory` to element metadata (#585)

* enhancement: add `file_directory` to element metadata

* update msg test

* exclude file_directory

* update slack output

* added file directory tests on partition_x paths
											
										
										
											2023-05-15 18:25:39 -04:00
+								    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
-												feat: optional page breaks for `.pptx`, `.pdf`, `.html` and images (#205)

* page breaks for pptx

* added page breaks for image/pdf

* tests for images with page breaks

* page breaks for html documents

* linting, linting, linting

* changelog and bump version

* update docs

* fix typo

* refactor reusable code to common.py

* add type back in
											
										
										
											2023-02-08 10:11:15 -05:00
-												fix: correct order of kwargs in pandoc (#421)

* fix: correct order of kwargs in pandoc

* only skip epub tests in Docker

* changelog

---------

Co-authored-by: Crag Wolfe <crag@unstructuredai.io>
Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-30 16:54:29 -04:00
+								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
-												feat: add `partition_ppt` for older power point docs (#238)

* added partition_ppt function and tests

* add ppt support to auto

* version bump

* update docs

* doc fixes

* update changelog

* `.docx` -> `.pptx`

* its -> their

* remove whitespace
											
										
										
											2023-02-17 11:57:08 -05:00
+								def test_auto_partition_ppt_from_filename():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
-												feat: add `partition_ppt` for older power point docs (#238)

* added partition_ppt function and tests

* add ppt support to auto

* version bump

* update docs

* doc fixes

* update changelog

* `.docx` -> `.pptx`

* its -> their

* remove whitespace
											
										
										
											2023-02-17 11:57:08 -05:00
+								    assert elements == EXPECTED_PPTX_OUTPUT
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset outpout file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												enhancement: add `file_directory` to element metadata (#585)

* enhancement: add `file_directory` to element metadata

* update msg test

* exclude file_directory

* update slack output

* added file directory tests on partition_x paths
											
										
										
											2023-05-15 18:25:39 -04:00
+								    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
-												feat: add `partition_ppt` for older power point docs (#238)

* added partition_ppt function and tests

* add ppt support to auto

* version bump

* update docs

* doc fixes

* update changelog

* `.docx` -> `.pptx`

* its -> their

* remove whitespace
											
										
										
											2023-02-17 11:57:08 -05:00
-												feat: optional page breaks for `.pptx`, `.pdf`, `.html` and images (#205)

* page breaks for pptx

* added page breaks for image/pdf

* tests for images with page breaks

* page breaks for html documents

* linting, linting, linting

* changelog and bump version

* update docs

* fix typo

* refactor reusable code to common.py

* add type back in
											
										
										
											2023-02-08 10:11:15 -05:00
+								def test_auto_with_page_breaks():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								    elements = partition(
 								        filename=filename, include_page_breaks=True, strategy=PartitionStrategy.HI_RES
 								    )
-												enhancement: clean pdf elements (bump unstructured-inference) (#790)

More deterministic element ordering when using hi_res PDF parsing strategy (from unstructured-inference bump to 0.5.4)
Make large model available (from unstructured-inference bump to 0.5.3)
Combine inferred elements with extracted elements (from unstructured-inference bump to 0.5.2)

---------

Co-authored-by: Roman Isecke <roman@unstructured.io>
Co-authored-by: Crag Wolfe <crag@unstructured.io>
											
										
										
											2023-06-29 20:35:06 -05:00
+								    assert "PageBreak" in [elem.category for elem in elements]
-												feat: add `partition_epub` function (#364)

* add pypandoc dependency

* added epub partitioner and file conversion

* test for partition_epub

* tests for file conversion

* add epub to filetype detection

* added epub to auto partition

* update bricks docs

* updated installing docs

* changelot and version

* add pandoc to dependencies

* add pandoc to debian dependencies

* linting, linting, linting

* typo fix

* typo fix

* file conversion type hints

* more type hints

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-03-14 11:52:21 -04:00
 								def test_auto_partition_epub_from_filename():
 								    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
-												feat: add `partition_epub` function (#364)

* add pypandoc dependency

* added epub partitioner and file conversion

* test for partition_epub

* tests for file conversion

* add epub to filetype detection

* added epub to auto partition

* update bricks docs

* updated installing docs

* changelot and version

* add pandoc to dependencies

* add pandoc to debian dependencies

* linting, linting, linting

* typo fix

* typo fix

* file conversion type hints

* more type hints

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-03-14 11:52:21 -04:00
+								    assert len(elements) > 0
 								    assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
 								def test_auto_partition_epub_from_file():
 								    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
 								    with open(filename, "rb") as f:
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-												feat: add `partition_epub` function (#364)

* add pypandoc dependency

* added epub partitioner and file conversion

* test for partition_epub

* tests for file conversion

* add epub to filetype detection

* added epub to auto partition

* update bricks docs

* updated installing docs

* changelot and version

* add pandoc to dependencies

* add pandoc to debian dependencies

* linting, linting, linting

* typo fix

* typo fix

* file conversion type hints

* more type hints

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-03-14 11:52:21 -04:00
+								    assert len(elements) > 0
 								    assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
-												feat: add `partition_msg` for MSFT Outlook files (#412)

* added msg-parser dependency

* pass through kwargs in convert_file_to_text

* added partition_msg for processing msft outlook files

* version bump and changelog

* added tests for partition_msg

* added test for msg with plain text

* add partition_msg docs; fix underlines in integration docs

* add .msg to file list

* finish tests for auto msg

* linting, linting, linting
											
										
										
											2023-03-28 16:15:22 -04:00
 								EXPECTED_MSG_OUTPUT = [
 								    NarrativeText(text="This is a test email to use for unit tests."),
 								    Title(text="Important points:"),
 								    ListItem(text="Roses are red"),
 								    ListItem(text="Violets are blue"),
 								]
 								def test_auto_partition_msg_from_filename():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
-												feat: add `partition_msg` for MSFT Outlook files (#412)

* added msg-parser dependency

* pass through kwargs in convert_file_to_text

* added partition_msg for processing msft outlook files

* version bump and changelog

* added tests for partition_msg

* added test for msg with plain text

* add partition_msg docs; fix underlines in integration docs

* add .msg to file list

* finish tests for auto msg

* linting, linting, linting
											
										
										
											2023-03-28 16:15:22 -04:00
+								    assert elements == EXPECTED_MSG_OUTPUT
-												feat: add `partition_rtf` for rich text files (#466)

* refactor epub; add rtf

* added test for rtf files

* filetype detection for rtf files

* add rtf to auto

* update docs for group_broken_paragraphs

* add rtf to docs

* update file list in readme

* update stage_for_transformers docs

* changelog and version bump

* skip rtf if in docker

* skip test if rtf not supported

* docs tweaks
											
										
										
											2023-04-10 17:25:03 -04:00
 								def test_auto_partition_rtf_from_filename():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf")
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
-												feat: add `partition_rtf` for rich text files (#466)

* refactor epub; add rtf

* added test for rtf files

* filetype detection for rtf files

* add rtf to auto

* update docs for group_broken_paragraphs

* add rtf to docs

* update file list in readme

* update stage_for_transformers docs

* changelog and version bump

* skip rtf if in docker

* skip test if rtf not supported

* docs tweaks
											
										
										
											2023-04-10 17:25:03 -04:00
+								    assert elements[0] == Title("My First Heading")
-												feat: add `url` kwarg to `partititon` (#470)

* added url option to auto partition

* add test for partition from url

* version and changelog

* update docs

* add url to element metadata
											
										
										
											2023-04-12 14:31:01 -04:00
 								def test_auto_partition_from_url():
 								    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								    elements = partition(url=url, content_type="text/plain", strategy=PartitionStrategy.HI_RES)
-												feat: add `url` kwarg to `partititon` (#470)

* added url option to auto partition

* add test for partition from url

* version and changelog

* update docs

* add url to element metadata
											
										
										
											2023-04-12 14:31:01 -04:00
+								    assert elements[0] == Title("Apache License")
 								    assert elements[0].metadata.url == url
-												feat: allow headers in `partition` (#473)

* feat: allow headers in `partition`

* warning if header is set and url is not

* update emoji test
											
										
										
											2023-04-13 11:04:15 -04:00
-												fix: parse URL response Content-Type according to RFC 9110 (#2950)

Currently, `file_and_type_from_url()` does not correctly handle the
`Content-Type` header. Specifically, it assumes that the header contains
only the mime-type (e.g. `text/html`), however, [RFC
9110](https://www.rfc-editor.org/rfc/rfc9110#field.content-type) allows
for additional directives — specifically the `charset` — to be returned
in the header. This leads to a `ValueError` when loading a URL with a
response Content-Type header such as `text/html; charset=UTF-8`.

To reproduce the issue:

```python
from unstructured.partition.auto import partition

url = "https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/"
partition(url=url)
```

Which will result in the following exception:

```python
{
	"name": "ValueError",
	"message": "Invalid file. The FileType.UNK file type is not supported in partition.",
	"stack": "---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[1], line 4
      1 from unstructured.partition.auto import partition
      3 url = \"https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/\"
----> 4 partition(url=url)

File ~/miniconda3/envs/ai-tasks/lib/python3.11/site-packages/unstructured/partition/auto.py:541, in partition(filename, content_type, file, file_filename, url, include_page_breaks, strategy, encoding, paragraph_grouper, headers, skip_infer_table_types, ssl_verify, ocr_languages, languages, detect_language_per_element, pdf_infer_table_structure, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, xml_keep_tags, data_source_metadata, metadata_filename, request_timeout, hi_res_model_name, model_name, date_from_file_object, starting_page_number, **kwargs)
    539 else:
    540     msg = \"Invalid file\" if not filename else f\"Invalid file {filename}\"
--> 541     raise ValueError(f\"{msg}. The {filetype} file type is not supported in partition.\")
    543 for element in elements:
    544     element.metadata.url = url

ValueError: Invalid file. The FileType.UNK file type is not supported in partition."
}
```

This PR fixes the issue by parsing the mime-type out of the
`Content-Type` header string.


Closes #2257
											
										
										
											2024-04-30 07:53:44 +02:00
+								def test_auto_partition_from_url_with_rfc9110_content_type():
 								    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
 								    elements = partition(
 								        url=url, content_type="text/plain; charset=utf-8", strategy=PartitionStrategy.HI_RES
 								    )
 								    assert elements[0] == Title("Apache License")
 								    assert elements[0].metadata.url == url
 								def test_auto_partition_from_url_without_providing_content_type():
 								    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
 								    elements = partition(url=url, strategy=PartitionStrategy.HI_RES)
 								    assert elements[0] == Title("Apache License")
 								    assert elements[0].metadata.url == url
-												fix: updates markdown code to process markdown with embedded html (#480)

* add carriage return to html if missing

* test on markdown with embedded html

* changelog and version

* check for html parser

* linting, linting, linting
											
										
										
											2023-04-13 12:47:45 -04:00
+								def test_partition_md_works_with_embedded_html():
 								    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								    elements = partition(url=url, content_type="text/markdown", strategy=PartitionStrategy.HI_RES)
-												fix: updates markdown code to process markdown with embedded html (#480)

* add carriage return to html if missing

* test on markdown with embedded html

* changelog and version

* check for html parser

* linting, linting, linting
											
										
										
											2023-04-13 12:47:45 -04:00
+								    elements[0].text
 								    unstructured_found = False
 								    for element in elements:
 								        if "unstructured" in elements[0].text:
 								            unstructured_found = True
 								            break
 								    assert unstructured_found is True
-												feat: allow headers in `partition` (#473)

* feat: allow headers in `partition`

* warning if header is set and url is not

* update emoji test
											
										
										
											2023-04-13 11:04:15 -04:00
+								def test_auto_partition_warns_if_header_set_and_not_url(caplog):
-												Bug/635 unicode decode error eml (#739)

* Adds functionality to extract charset info from eml files
* Adds missed file-like object handling in detect_file_encoding
* Adds functionality to replace the MIME encodings for eml files with one of the
   common encodings if a unicode error occurs
* Organize the eml example files in the example-docs/eml directory

											
										
										
											2023-06-16 17:52:13 -07:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								    partition(
 								        filename=filename, headers={"Accept": "application/pdf"}, strategy=PartitionStrategy.HI_RES
 								    )
-												feat: allow headers in `partition` (#473)

* feat: allow headers in `partition`

* warning if header is set and url is not

* update emoji test
											
										
										
											2023-04-13 11:04:15 -04:00
+								    assert caplog.records[0].levelname == "WARNING"
-												fix: update `detect_filetype` for JSONs with text/plain MIME type (#520)

* check to see if text file is a json

* add json check into filetype detection

* added test for updated file detection logic

* bytes/strings handling

* changlog and version bump
											
										
										
											2023-04-26 13:52:47 -04:00
 								def test_auto_partition_works_with_unstructured_jsons():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
-												fix: update `detect_filetype` for JSONs with text/plain MIME type (#520)

* check to see if text file is a json

* add json check into filetype detection

* added test for updated file detection logic

* bytes/strings handling

* changlog and version bump
											
										
										
											2023-04-26 13:52:47 -04:00
+								    assert elements[0].text == "News Around NOAA"
 								def test_auto_partition_works_with_unstructured_jsons_from_file():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
 								    with open(filename, "rb") as f:
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-												fix: update `detect_filetype` for JSONs with text/plain MIME type (#520)

* check to see if text file is a json

* add json check into filetype detection

* added test for updated file detection logic

* bytes/strings handling

* changlog and version bump
											
										
										
											2023-04-26 13:52:47 -04:00
+								    assert elements[0].text == "News Around NOAA"
-												feat: add `partition_odt` for open office docs (#548)

* added filetype detection for odt

* add function for partition odt documents

* add odt files to auto

* changelog and version

* docs and readme

* update installation docs

* skip tests if not supported or in docker

* import pytest

* fix docs typos
											
										
										
											2023-05-04 15:28:08 -04:00
 								def test_auto_partition_odt_from_filename():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
-												chore: adding test case for odt tables (#1434)

ODT table extraction is happening! Just added to an existing example-doc
and an accompanying test case.
											
										
										
											2023-09-16 22:29:44 -07:00
+								    assert elements[0] == Title("Lorem ipsum dolor sit amet.")
-												feat: add `partition_odt` for open office docs (#548)

* added filetype detection for odt

* add function for partition odt documents

* add odt files to auto

* changelog and version

* docs and readme

* update installation docs

* skip tests if not supported or in docker

* import pytest

* fix docs typos
											
										
										
											2023-05-04 15:28:08 -04:00
 								def test_auto_partition_odt_from_file():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
 								    with open(filename, "rb") as f:
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-												feat: add `partition_odt` for open office docs (#548)

* added filetype detection for odt

* add function for partition odt documents

* add odt files to auto

* changelog and version

* docs and readme

* update installation docs

* skip tests if not supported or in docker

* import pytest

* fix docs typos
											
										
										
											2023-05-04 15:28:08 -04:00
-												chore: adding test case for odt tables (#1434)

ODT table extraction is happening! Just added to an existing example-doc
and an accompanying test case.
											
										
										
											2023-09-16 22:29:44 -07:00
+								    assert elements[0] == Title("Lorem ipsum dolor sit amet.")
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
 								@pytest.mark.parametrize(
 								    ("content_type", "routing_func", "expected"),
 								    [
-												refactor: simplifies JSON detection and add tests (#975)

* refactor json detection

* version and changelog

* fix mock in test
											
										
										
											2023-07-25 15:59:45 -04:00
+								        ("text/csv", "csv", "text/csv"),
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								        ("text/html", "html", "text/html"),
 								        ("jdsfjdfsjkds", "pdf", None),
 								    ],
 								)
-												enhancement: tell users to install missing extras (#1167)

### Summary

Updates `partition` to let users know to installs the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating `partition_pdf` (or whichever function that requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
											
										
										
											2023-08-21 23:00:21 -04:00
+								def test_auto_adds_filetype_to_metadata(content_type, routing_func, expected, monkeypatch):
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								    with patch(
 								        f"unstructured.partition.auto.partition_{routing_func}",
 								        lambda *args, **kwargs: [Text("text 1"), Text("text 2")],
-												enhancement: tell users to install missing extras (#1167)

### Summary

Updates `partition` to let users know to installs the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating `partition_pdf` (or whichever function that requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
											
										
										
											2023-08-21 23:00:21 -04:00
+								    ) as mock_partition:
 								        mock_partition_with_extras_map = {routing_func: mock_partition}
 								        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								        elements = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type)
 								    assert len(elements) == 2
 								    assert all(el.metadata.filetype == expected for el in elements)
 								@pytest.mark.parametrize(
 								    ("content_type", "expected"),
 								    [
 								        ("application/pdf", FILETYPE_TO_MIMETYPE[FileType.PDF]),
 								        (None, FILETYPE_TO_MIMETYPE[FileType.PDF]),
 								    ],
 								)
-												enhancement: tell users to install missing extras (#1167)

### Summary

Updates `partition` to let users know to installs the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating `partition_pdf` (or whichever function that requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
											
										
										
											2023-08-21 23:00:21 -04:00
+								def test_auto_filetype_overrides_file_specific(content_type, expected, monkeypatch):
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								    pdf_metadata = ElementMetadata(filetype="imapdf")
 								    with patch(
 								        "unstructured.partition.auto.partition_pdf",
 								        lambda *args, **kwargs: [
 								            Text("text 1", metadata=pdf_metadata),
 								            Text("text 2", metadata=pdf_metadata),
 								        ],
-												enhancement: tell users to install missing extras (#1167)

### Summary

Updates `partition` to let users know to installs the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating `partition_pdf` (or whichever function that requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
											
										
										
											2023-08-21 23:00:21 -04:00
+								    ) as mock_partition:
 								        mock_partition_with_extras_map = {"pdf": mock_partition}
 								        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								        elements = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type)
 								    assert len(elements) == 2
 								    assert all(el.metadata.filetype == expected for el in elements)
-												Refactor: rename image extraction kwargs (#2303)

Currently, we're using different kwarg names in partition() and
partition_pdf(), which has implications for the API since it goes
through partition().

### Summary
- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` to `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in
`partition.auto`
- add unit tests to test element extraction for `pdf/image` via
`partition.auto`
### Testing
CI should pass.
											
										
										
											2024-01-04 09:52:00 -08:00
+								@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
 								def test_auto_partition_pdf_element_extraction(
 								    extract_image_block_to_payload,
 								    filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "embedded-images-tables.pdf"),
 								):
 								    extract_image_block_types = ["Image", "Table"]
 								    with tempfile.TemporaryDirectory() as tmpdir:
 								        elements = partition(
 								            filename=filename,
 								            extract_image_block_types=extract_image_block_types,
 								            extract_image_block_to_payload=extract_image_block_to_payload,
 								            extract_image_block_output_dir=tmpdir,
 								        )
 								        assert_element_extraction(
 								            elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
 								        )
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								supported_filetypes = [
 								    _
 								    for _ in FileType
 								    if _
 								    not in (
 								        FileType.UNK,
 								        FileType.ZIP,
 								        FileType.XLS,
 								    )
 								]
 								FILETYPE_TO_MODULE = {
 								    FileType.JPG: "image",
 								    FileType.PNG: "image",
-												feat: add support for partitioning .heic files (#2454)

.heic files are an image filetype we have not supported.

#### Testing
```
from unstructured.partition.image import partition_image

png_filename = "example-docs/DA-1p.png"
heic_filename = "example-docs/DA-1p.heic"

png_elements = partition_image(png_filename, strategy="hi_res")
heic_elements = partition_image(heic_filename, strategy="hi_res")

for i in range(len(heic_elements)):
	print(heic_elements[i].text == png_elements[i].text)
```

---------

Co-authored-by: christinestraub <christinemstraub@gmail.com>
											
										
										
											2024-01-29 22:49:00 -06:00
+								    FileType.HEIC: "image",
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								    FileType.TXT: "text",
 								    FileType.EML: "email",
 								}
 								@pytest.mark.parametrize("filetype", supported_filetypes)
 								def test_file_specific_produces_correct_filetype(filetype: FileType):
-												enhancement: add support from bitmap images (#2414)

### Summary

Adds support for bitmap images (`.bmp`) in both file detection and
partitioning. Bitmap images will be processed with `partition_image`
just like JPGs and PNGs.

### Testing

```python
from unstructured.file_utils.filetype import detect_filetype
from unstructured.partition.auto import partition
from PIL import Image

filename = "example-docs/layout-parser-paper-with-table.jpg"
bmp_filename = "~/tmp/ayout-parser-paper-with-table.bmp"

img = Image.open(filename)
img.save(bmp_filename)

detect_filetype(filename=bmp_filename) # Should be FileType.BMP

elements = partition(filename=bmp_filename)
```
											
										
										
											2024-01-17 17:50:36 -05:00
+								    if filetype in auto.IMAGE_FILETYPES or filetype in (FileType.WAV, FileType.EMPTY):
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								        pytest.skip()
 								    extension = filetype.name.lower()
-												refactor: isolate ingest dependencies into local scopes (#2509)

This PR: 
- Moves ingest dependencies into local scopes to be able to import
ingest connector classes without the need of installing imported
external dependencies. This allows lightweight use of the classes (not
the instances. to use the instances as intended you'll still need the
dependencies).
- Upgrades the embed module dependencies from `langchain` to
`langchain-community` module (to pass CI [rather than introducing a
pin])
- Does pip-compile
- Does minor refactors in other files to pass `ruff 2.0` checks which
were introduced by pip-compile
											
										
										
											2024-02-06 21:28:55 +00:00
+								    filetype_module = FILETYPE_TO_MODULE.get(filetype, extension)
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								    fun_name = "partition_" + filetype_module
-												Refactor: importation consistency for `partition_pdf()` and `partition_image()` (#2282)

Closes #2278. This PR also removes the `extract_tables_in_pdf` mentioned
in issue #2280.
											
										
										
											2023-12-15 14:29:58 -08:00
+								    module = import_module(f"unstructured.partition.{filetype_module}")  # noqa
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								    fun = eval(f"module.{fun_name}")
 								    for file in pathlib.Path("example-docs").iterdir():
 								        if file.is_file() and file.suffix == f".{extension}":
 								            elements = fun(str(file))
-												enhancement: handling for empty files in `detect_filetype` and `partition` (#710)

* add empty filetype

* add empty handling to partition

* changelog and version
											
										
										
											2023-06-09 16:07:50 -04:00
+								            assert all(
 								                el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype]
 								                for el in elements
 								                if el.metadata.filetype is not None
 								            )
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								            break
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
-												feat: add `partition_xml` for XML files (#596)

* first pass on partition_xml

* add option to keep xml tags

* added tests for xml

* fix filename

* update filenames

* remove outdated readme

* add xml to auto

* version and changelog

* update readme and docs

* pass through include_metadata

* update include_metadata description

* add README back in

* linting, linting, linting

* more linting

* spooled to bytes doesnt need to be a tuple

* Add tests for newly supported filetypes

* Correct metadata filetype

* doc typo

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* keep_xml_tags -> xml_keep_tags

---------

Co-authored-by: Alan Bertl <alan@unstructured.io>
Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-05-18 11:40:12 -04:00
+								def test_auto_partition_xml_from_filename(filename="example-docs/factbook.xml"):
-												feat: `partition_xml` infers element type on each leaf node (#1249)

### Summary

Closes #1229. Updates `partition_xml` so that the element type is
inferred on each leaf node when `xml_keep_tags=False` instead of
delegating splitting and partitioning to `partition_xml`. If
`xml_keep_tags=True`, the file is treated like a text file still and
partitioning is still delegated to `partition_text`.

Also adds the option to pass `text` as an input to `partition_xml`.

### Testing

Create a `parrots.xml` file that looks like:

```xml
<xml><parrot><name>Conure</name><description>A conure is a very friendly bird.

Conures are feathery and like to dance.</description></parrot></xml>
```

Run:

```python
from unstructured.partition.xml import partition_xml
from unstructured.staging.base import convert_to_dict

elements = partition_xml(filename="parrots.xml")
convert_to_dict(elements)
```

One `main`, the output is the following. Notice how the `<name>` tag
incorrectly gets merged into `<description>` in the first element.

```python
[{'element_id': '7ae4074435df8dfcefcf24a4e6c52026',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conure A conure is a very friendly bird.',
  'type': 'NarrativeText'},
 {'element_id': '859ecb332da6961acd2fb6a0185d1549',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conures are feathery and like to dance.',
  'type': 'NarrativeText'}]

```

One the feature branch, the output is the following, and the tags are
correctly separated.

```python
[{'element_id': '5512218914e4eeacf71a9cd42c373710',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conure',
  'type': 'Title'},
 {'element_id': '113bf8d250c2b1a77c9c2caa4b812f85',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'A conure is a very friendly bird.\n'
          '\n'
          'Conures are feathery and like to dance.',
  'type': 'NarrativeText'}]

```
											
										
										
											2023-08-30 17:07:10 -04:00
+								    elements = partition(filename=filename, xml_keep_tags=False, metadata_filename=filename)
-												feat: add `partition_xml` for XML files (#596)

* first pass on partition_xml

* add option to keep xml tags

* added tests for xml

* fix filename

* update filenames

* remove outdated readme

* add xml to auto

* version and changelog

* update readme and docs

* pass through include_metadata

* update include_metadata description

* add README back in

* linting, linting, linting

* more linting

* spooled to bytes doesnt need to be a tuple

* Add tests for newly supported filetypes

* Correct metadata filetype

* doc typo

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* keep_xml_tags -> xml_keep_tags

---------

Co-authored-by: Alan Bertl <alan@unstructured.io>
Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-05-18 11:40:12 -04:00
 								    assert elements[0].text == "United States"
 								    assert elements[0].metadata.filename == "factbook.xml"
 								def test_auto_partition_xml_from_file(filename="example-docs/factbook.xml"):
 								    with open(filename, "rb") as f:
 								        elements = partition(file=f, xml_keep_tags=False)
 								    assert elements[0].text == "United States"
 								def test_auto_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"):
 								    elements = partition(filename=filename, xml_keep_tags=True)
-												feat: `partition_xml` infers element type on each leaf node (#1249)

### Summary

Closes #1229. Updates `partition_xml` so that the element type is
inferred on each leaf node when `xml_keep_tags=False` instead of
delegating splitting and partitioning to `partition_xml`. If
`xml_keep_tags=True`, the file is treated like a text file still and
partitioning is still delegated to `partition_text`.

Also adds the option to pass `text` as an input to `partition_xml`.

### Testing

Create a `parrots.xml` file that looks like:

```xml
<xml><parrot><name>Conure</name><description>A conure is a very friendly bird.

Conures are feathery and like to dance.</description></parrot></xml>
```

Run:

```python
from unstructured.partition.xml import partition_xml
from unstructured.staging.base import convert_to_dict

elements = partition_xml(filename="parrots.xml")
convert_to_dict(elements)
```

One `main`, the output is the following. Notice how the `<name>` tag
incorrectly gets merged into `<description>` in the first element.

```python
[{'element_id': '7ae4074435df8dfcefcf24a4e6c52026',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conure A conure is a very friendly bird.',
  'type': 'NarrativeText'},
 {'element_id': '859ecb332da6961acd2fb6a0185d1549',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conures are feathery and like to dance.',
  'type': 'NarrativeText'}]

```

One the feature branch, the output is the following, and the tags are
correctly separated.

```python
[{'element_id': '5512218914e4eeacf71a9cd42c373710',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conure',
  'type': 'Title'},
 {'element_id': '113bf8d250c2b1a77c9c2caa4b812f85',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'A conure is a very friendly bird.\n'
          '\n'
          'Conures are feathery and like to dance.',
  'type': 'NarrativeText'}]

```
											
										
										
											2023-08-30 17:07:10 -04:00
+								    assert "<leader>Joe Biden</leader>" in elements[0].text
 								    assert elements[0].metadata.filename == "factbook.xml"
-												feat: add `partition_xml` for XML files (#596)

* first pass on partition_xml

* add option to keep xml tags

* added tests for xml

* fix filename

* update filenames

* remove outdated readme

* add xml to auto

* version and changelog

* update readme and docs

* pass through include_metadata

* update include_metadata description

* add README back in

* linting, linting, linting

* more linting

* spooled to bytes doesnt need to be a tuple

* Add tests for newly supported filetypes

* Correct metadata filetype

* doc typo

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* keep_xml_tags -> xml_keep_tags

---------

Co-authored-by: Alan Bertl <alan@unstructured.io>
Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-05-18 11:40:12 -04:00
 								def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"):
 								    with open(filename, "rb") as f:
 								        elements = partition(file=f, xml_keep_tags=True)
-												feat: `partition_xml` infers element type on each leaf node (#1249)

### Summary

Closes #1229. Updates `partition_xml` so that the element type is
inferred on each leaf node when `xml_keep_tags=False` instead of
delegating splitting and partitioning to `partition_xml`. If
`xml_keep_tags=True`, the file is treated like a text file still and
partitioning is still delegated to `partition_text`.

Also adds the option to pass `text` as an input to `partition_xml`.

### Testing

Create a `parrots.xml` file that looks like:

```xml
<xml><parrot><name>Conure</name><description>A conure is a very friendly bird.

Conures are feathery and like to dance.</description></parrot></xml>
```

Run:

```python
from unstructured.partition.xml import partition_xml
from unstructured.staging.base import convert_to_dict

elements = partition_xml(filename="parrots.xml")
convert_to_dict(elements)
```

One `main`, the output is the following. Notice how the `<name>` tag
incorrectly gets merged into `<description>` in the first element.

```python
[{'element_id': '7ae4074435df8dfcefcf24a4e6c52026',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conure A conure is a very friendly bird.',
  'type': 'NarrativeText'},
 {'element_id': '859ecb332da6961acd2fb6a0185d1549',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conures are feathery and like to dance.',
  'type': 'NarrativeText'}]

```

One the feature branch, the output is the following, and the tags are
correctly separated.

```python
[{'element_id': '5512218914e4eeacf71a9cd42c373710',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conure',
  'type': 'Title'},
 {'element_id': '113bf8d250c2b1a77c9c2caa4b812f85',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'A conure is a very friendly bird.\n'
          '\n'
          'Conures are feathery and like to dance.',
  'type': 'NarrativeText'}]

```
											
										
										
											2023-08-30 17:07:10 -04:00
+								    assert "<leader>Joe Biden</leader>" in elements[0].text
-												feat: add `partition_xml` for XML files (#596)

* first pass on partition_xml

* add option to keep xml tags

* added tests for xml

* fix filename

* update filenames

* remove outdated readme

* add xml to auto

* version and changelog

* update readme and docs

* pass through include_metadata

* update include_metadata description

* add README back in

* linting, linting, linting

* more linting

* spooled to bytes doesnt need to be a tuple

* Add tests for newly supported filetypes

* Correct metadata filetype

* doc typo

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* keep_xml_tags -> xml_keep_tags

---------

Co-authored-by: Alan Bertl <alan@unstructured.io>
Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-05-18 11:40:12 -04:00
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
+								EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
 								def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
-												chore: fix infer_table bug (#1833)

Carrying `skip_infer_table_types` to `infer_table_structure` in
partition flow. Now PPT/X, DOC/X, etc. Table elements should not have a
`text_as_html` field.

Note: I've continued to exclude this var from partitioners that go
through html flow, I think if we've already got the html it doesn't make
sense to carry the infer variable along, since we're not 'infer-ing' the
html table in these cases.


TODO:
✅  add unit tests

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: amanda103 <amanda103@users.noreply.github.com>
											
										
										
											2023-10-23 17:11:53 -07:00
+								    elements = partition(filename=filename, include_header=False, skip_infer_table_types=[])
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
-												feat: xlsx subtable extraction (#1585)

**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.

**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text

**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">

```
[
    {
        "type": "Title",
        "element_id": "3315afd97f7f2ebcd450e7c939878429",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Financial performance"
    },
    {
        "type": "Table",
        "element_id": "17f5d512705be6f8812e5dbb801ba727",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "8a9db7161a02b427f8fda883656036e1",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Operational metrics"
    },
    {
        "type": "Table",
        "element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Other"
    },
    {
        "type": "Table",
        "element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
    }
]
```
											
										
										
											2023-10-04 13:30:23 -04:00
+								    assert sum(isinstance(element, Table) for element in elements) == 2
 								    assert sum(isinstance(element, Title) for element in elements) == 2
 								    assert len(elements) == 4
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
-												feat: xlsx subtable extraction (#1585)

**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.

**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text

**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">

```
[
    {
        "type": "Title",
        "element_id": "3315afd97f7f2ebcd450e7c939878429",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Financial performance"
    },
    {
        "type": "Table",
        "element_id": "17f5d512705be6f8812e5dbb801ba727",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "8a9db7161a02b427f8fda883656036e1",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Operational metrics"
    },
    {
        "type": "Table",
        "element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Other"
    },
    {
        "type": "Table",
        "element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
    }
]
```
											
										
										
											2023-10-04 13:30:23 -04:00
+								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
-												fix: stop csv and tsv dropping the first line of the file (#1530)

The current code assumes the first line of csv and tsv files are a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.

Here is a snippet of code that demonstrates the current behavior and the
proposed fix

```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring

c1 = """
    Stanley Cups,,
    Team,Location,Stanley Cups
    Blues,STL,1
    Flyers,PHI,2
    Maple Leafs,TOR,13
    """

f = "./test.csv"
with open(f, 'w') as ff:
    ff.write(c1)
  
print("Suggested Improvement Keep First Line") 
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)

print("\n\nOriginal Looses First Line") 
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-17 00:59:35 +02:00
+								    assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT_XLSX
 								    assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX
-												feat: xlsx subtable extraction (#1585)

**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.

**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text

**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">

```
[
    {
        "type": "Title",
        "element_id": "3315afd97f7f2ebcd450e7c939878429",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Financial performance"
    },
    {
        "type": "Table",
        "element_id": "17f5d512705be6f8812e5dbb801ba727",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "8a9db7161a02b427f8fda883656036e1",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Operational metrics"
    },
    {
        "type": "Table",
        "element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Other"
    },
    {
        "type": "Table",
        "element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
    }
]
```
											
										
										
											2023-10-04 13:30:23 -04:00
+								    assert elements[1].metadata.page_number == 1
 								    assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
-												chore: fix infer_table bug (#1833)

Carrying `skip_infer_table_types` to `infer_table_structure` in
partition flow. Now PPT/X, DOC/X, etc. Table elements should not have a
`text_as_html` field.

Note: I've continued to exclude this var from partitioners that go
through html flow, I think if we've already got the html it doesn't make
sense to carry the infer variable along, since we're not 'infer-ing' the
html table in these cases.


TODO:
✅  add unit tests

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: amanda103 <amanda103@users.noreply.github.com>
											
										
										
											2023-10-23 17:11:53 -07:00
+								@pytest.mark.parametrize(
 								    ("skip_infer_table_types", "filename", "has_text_as_html_field"),
 								    [
 								        (["xlsx"], "stanley-cups.xlsx", False),
 								        ([], "stanley-cups.xlsx", True),
 								        (["odt"], "fake.odt", False),
 								        ([], "fake.odt", True),
 								    ],
 								)
 								def test_auto_partition_respects_skip_infer_table_types(
-												Chore: stop passing extract_tables to inference and note table regression on entire doc OCR (#1850)

### Summary

A follow up ticket on
https://github.com/Unstructured-IO/unstructured/pull/1801, I forgot to
remove the lines that pass extract_tables to inference, and noted the
table regression if we only do one OCR for entire doc

**Tech details:**
* stop passing `extract_tables` parameter to inference
* added table extraction ingest test for image, which was skipped
before, and the "text_as_html" field contains the OCR output from the
table OCR refactor PR
* replaced `assert_called_once_with` with `call_args` so that the unit
tests don't need to test additional parameters
* added `error_margin` as ENV when comparing bounding boxes
of`ocr_region` with `table_element`
* added more tests for tables and noted the table regression in test for
partition pdf

### Test
* for stop passing `extract_tables` parameter to inference, run test
`test_partition_pdf_hi_res_ocr_mode_with_table_extraction` before this
branch and you will see warning like `Table OCR from get_tokens method
will be deprecated....`, which means it called the table OCR in
inference repo. This branch removed the warning.
											
										
										
											2023-10-24 13:13:28 -04:00
+								    skip_infer_table_types,
 								    filename,
 								    has_text_as_html_field,
-												chore: fix infer_table bug (#1833)

Carrying `skip_infer_table_types` to `infer_table_structure` in
partition flow. Now PPT/X, DOC/X, etc. Table elements should not have a
`text_as_html` field.

Note: I've continued to exclude this var from partitioners that go
through html flow, I think if we've already got the html it doesn't make
sense to carry the infer variable along, since we're not 'infer-ing' the
html table in these cases.


TODO:
✅  add unit tests

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: amanda103 <amanda103@users.noreply.github.com>
											
										
										
											2023-10-23 17:11:53 -07:00
+								):
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
 								    with open(filename, "rb") as f:
 								        table_elements = [
 								            e
 								            for e in partition(file=f, skip_infer_table_types=skip_infer_table_types)
 								            if isinstance(e, Table)
 								        ]
 								        for table_element in table_elements:
 								            table_element_has_text_as_html_field = (
 								                hasattr(table_element.metadata, "text_as_html")
 								                and table_element.metadata.text_as_html is not None
 								            )
 								        assert table_element_has_text_as_html_field == has_text_as_html_field
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
+								def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
 								    with open(filename, "rb") as f:
-												chore: fix infer_table bug (#1833)

Carrying `skip_infer_table_types` to `infer_table_structure` in
partition flow. Now PPT/X, DOC/X, etc. Table elements should not have a
`text_as_html` field.

Note: I've continued to exclude this var from partitioners that go
through html flow, I think if we've already got the html it doesn't make
sense to carry the infer variable along, since we're not 'infer-ing' the
html table in these cases.


TODO:
✅  add unit tests

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: amanda103 <amanda103@users.noreply.github.com>
											
										
										
											2023-10-23 17:11:53 -07:00
+								        elements = partition(file=f, include_header=False, skip_infer_table_types=[])
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
-												feat: xlsx subtable extraction (#1585)

**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.

**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text

**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">

```
[
    {
        "type": "Title",
        "element_id": "3315afd97f7f2ebcd450e7c939878429",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Financial performance"
    },
    {
        "type": "Table",
        "element_id": "17f5d512705be6f8812e5dbb801ba727",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "8a9db7161a02b427f8fda883656036e1",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Operational metrics"
    },
    {
        "type": "Table",
        "element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Other"
    },
    {
        "type": "Table",
        "element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
    }
]
```
											
										
										
											2023-10-04 13:30:23 -04:00
+								    assert sum(isinstance(element, Table) for element in elements) == 2
 								    assert sum(isinstance(element, Title) for element in elements) == 2
 								    assert len(elements) == 4
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
-												feat: xlsx subtable extraction (#1585)

**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.

**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text

**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">

```
[
    {
        "type": "Title",
        "element_id": "3315afd97f7f2ebcd450e7c939878429",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Financial performance"
    },
    {
        "type": "Table",
        "element_id": "17f5d512705be6f8812e5dbb801ba727",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "8a9db7161a02b427f8fda883656036e1",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Operational metrics"
    },
    {
        "type": "Table",
        "element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Other"
    },
    {
        "type": "Table",
        "element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
    }
]
```
											
										
										
											2023-10-04 13:30:23 -04:00
+								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
-												fix: stop csv and tsv dropping the first line of the file (#1530)

The current code assumes the first line of csv and tsv files are a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.

Here is a snippet of code that demonstrates the current behavior and the
proposed fix

```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring

c1 = """
    Stanley Cups,,
    Team,Location,Stanley Cups
    Blues,STL,1
    Flyers,PHI,2
    Maple Leafs,TOR,13
    """

f = "./test.csv"
with open(f, 'w') as ff:
    ff.write(c1)
  
print("Suggested Improvement Keep First Line") 
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)

print("\n\nOriginal Looses First Line") 
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-17 00:59:35 +02:00
+								    assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT_XLSX
 								    assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX
-												feat: xlsx subtable extraction (#1585)

**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.

**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text

**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">

```
[
    {
        "type": "Title",
        "element_id": "3315afd97f7f2ebcd450e7c939878429",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Financial performance"
    },
    {
        "type": "Table",
        "element_id": "17f5d512705be6f8812e5dbb801ba727",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "8a9db7161a02b427f8fda883656036e1",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Operational metrics"
    },
    {
        "type": "Table",
        "element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Other"
    },
    {
        "type": "Table",
        "element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
    }
]
```
											
										
										
											2023-10-04 13:30:23 -04:00
+								    assert elements[1].metadata.page_number == 1
 								    assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
-												Introduce `start_page` argument to partitioning functions that assign `element.metadata.page_number` (#2884)

This small change will be useful for users who partition only fragments
of their PDF documents.
It's a small step towards addressing this issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

Related PRs:
* https://github.com/Unstructured-IO/unstructured/pull/2842
* https://github.com/Unstructured-IO/unstructured/pull/2673
											
										
										
											2024-04-15 23:03:42 +02:00
+								def test_auto_partition_respects_starting_page_number_argument_for_xlsx():
 								    elements = partition("example-docs/stanley-cups.xlsx", starting_page_number=3)
 								    assert elements[1].metadata.page_number == 3
-												feat: xlsx subtable extraction (#1585)

**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.

**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text

**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">

```
[
    {
        "type": "Title",
        "element_id": "3315afd97f7f2ebcd450e7c939878429",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Financial performance"
    },
    {
        "type": "Table",
        "element_id": "17f5d512705be6f8812e5dbb801ba727",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "8a9db7161a02b427f8fda883656036e1",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Operational metrics"
    },
    {
        "type": "Table",
        "element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Other"
    },
    {
        "type": "Table",
        "element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
    }
]
```
											
										
										
											2023-10-04 13:30:23 -04:00
+								EXPECTED_XLS_TEXT_LEN = 550
-												build(release): bump unstructured-inference (#1074)

* build(release): bump unstructured-inference

Related to downstream issue:
Unstructured-IO/unstructured-api#182

And upstream PR:
Unstructured-IO/unstructured-inference#165

---------

Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
											
										
										
											2023-08-10 13:57:46 -07:00
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
-												feat: xlsx subtable extraction (#1585)

**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.

**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text

**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">

```
[
    {
        "type": "Title",
        "element_id": "3315afd97f7f2ebcd450e7c939878429",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Financial performance"
    },
    {
        "type": "Table",
        "element_id": "17f5d512705be6f8812e5dbb801ba727",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "8a9db7161a02b427f8fda883656036e1",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Operational metrics"
    },
    {
        "type": "Table",
        "element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Other"
    },
    {
        "type": "Table",
        "element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
    }
]
```
											
										
										
											2023-10-04 13:30:23 -04:00
+								EXPECTED_XLS_INITIAL_45_CLEAN_TEXT = "MC What is 2+2? 4 correct 3 incorrect MA What"
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
 								EXPECTED_XLS_TABLE = (
 								    """<table border="1" class="dataframe">
 								  <tbody>
-												feat: xlsx subtable extraction (#1585)

**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.

**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text

**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">

```
[
    {
        "type": "Title",
        "element_id": "3315afd97f7f2ebcd450e7c939878429",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Financial performance"
    },
    {
        "type": "Table",
        "element_id": "17f5d512705be6f8812e5dbb801ba727",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "8a9db7161a02b427f8fda883656036e1",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Operational metrics"
    },
    {
        "type": "Table",
        "element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Other"
    },
    {
        "type": "Table",
        "element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
    }
]
```
											
										
										
											2023-10-04 13:30:23 -04:00
+								    <tr>
 								      <td>MC</td>
 								      <td>What is 2+2?</td>
 								      <td>4</td>
 								      <td>correct</td>
 								      <td>3</td>
 								      <td>incorrect</td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								    </tr>
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
+								    <tr>
 								      <td>MA</td>
 								      <td>What C datatypes are 8 bits? (assume i386)</td>
 								      <td>int</td>
 								      <td></td>
 								      <td>float</td>
 								      <td></td>
 								      <td>double</td>
 								      <td></td>
 								      <td>char</td>
 								    </tr>
 								    <tr>
 								      <td>TF</td>
 								      <td>Bagpipes are awesome.</td>
 								      <td>true</td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								    </tr>
 								    <tr>
 								      <td>ESS</td>
 								      <td>How have the original Henry Hornbostel buildings """
 								    """influenced campus architecture and design in the last 30 years?</td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								    </tr>
 								    <tr>
 								      <td>ORD</td>
 								      <td>Rank the following in their order of operation.</td>
 								      <td>Parentheses</td>
 								      <td>Exponents</td>
 								      <td>Division</td>
 								      <td>Addition</td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								    </tr>
 								    <tr>
 								      <td>FIB</td>
 								      <td>The student activities fee is</td>
 								      <td>95</td>
 								      <td>dollars for students enrolled in</td>
 								      <td>19</td>
 								      <td>units or more,</td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								    </tr>
 								    <tr>
 								      <td>MAT</td>
 								      <td>Match the lower-case greek letter with its capital form.</td>
 								      <td>λ</td>
 								      <td>Λ</td>
 								      <td>α</td>
 								      <td>γ</td>
 								      <td>Γ</td>
 								      <td>φ</td>
 								      <td>Φ</td>
 								    </tr>
 								  </tbody>
 								</table>"""
 								)
-												fix: extract emojis with `partition_xlsx` (#1009)

* 🐛 fixxed emoji xlsx bug

* update version and changelog

* check if beautifulsoup exists

* update docs

* fix html parser call

* fix failing attachment test

* ✅  added emoji test, added requirment fixed dependency

* 🐛 dependency

* 🐛 correct depeendency

* linting, linting, linting

* check for bs4

* skip auto xls filename test

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
											
										
										
											2023-08-04 16:14:08 +02:00
+								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
+								def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"):
-												chore: fix infer_table bug (#1833)

Carrying `skip_infer_table_types` to `infer_table_structure` in
partition flow. Now PPT/X, DOC/X, etc. Table elements should not have a
`text_as_html` field.

Note: I've continued to exclude this var from partitioners that go
through html flow, I think if we've already got the html it doesn't make
sense to carry the infer variable along, since we're not 'infer-ing' the
html table in these cases.


TODO:
✅  add unit tests

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: amanda103 <amanda103@users.noreply.github.com>
											
										
										
											2023-10-23 17:11:53 -07:00
+								    elements = partition(filename=filename, include_header=False, skip_infer_table_types=[])
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
-												feat: xlsx subtable extraction (#1585)

**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.

**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text

**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">

```
[
    {
        "type": "Title",
        "element_id": "3315afd97f7f2ebcd450e7c939878429",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Financial performance"
    },
    {
        "type": "Table",
        "element_id": "17f5d512705be6f8812e5dbb801ba727",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "8a9db7161a02b427f8fda883656036e1",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Operational metrics"
    },
    {
        "type": "Table",
        "element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Other"
    },
    {
        "type": "Table",
        "element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
    }
]
```
											
										
										
											2023-10-04 13:30:23 -04:00
+								    assert sum(isinstance(element, Table) for element in elements) == 2
-												fix(xlsx): xlsx subtable algorithm (#2534)

**Reviewers:** It may be easier to review each of the two commits
separately. The first adds the new `_SubtableParser` object with its
unit-tests and the second one uses that object to replace the flawed
existing subtable-parsing algorithm.

**Summary**

There are a cluster of bugs in `partition_xlsx()` that all derive from
flaws in the algorithm we use to detect "subtables". These are
encountered when the user wants to get multiple document-elements from
each worksheet, which is the default (argument `find_subtable = True`).

This PR replaces the flawed existing algorithm with a `_SubtableParser`
object that encapsulates all that logic and has thorough unit-tests.

**Additional Context**

This is a summary of the failure cases. There are a few other cases but
they're closely related and this was enough evidence and scope for my
purposes. This PR fixes all these bugs:
```python
    #
    # -- ✅ CASE 1: There are no leading or trailing single-cell rows.
    #       -> this subtable functions never get called, subtable is emitted as the only element
    #
    #    a b  -> Table(a, b, c, d)
    #    c d

    # -- ✅ CASE 2: There is exactly one leading single-cell row.
    #       -> Leading single-cell row emitted as `Title` element, core-table properly identified.
    #
    #    a    -> [ Title(a),
    #    b c       Table(b, c, d, e) ]
    #    d e

    # -- ❌ CASE 3: There are two-or-more leading single-cell rows.
    #       -> leading single-cell rows are included in subtable
    #
    #    a    -> [ Table(a, b, c, d, e, f) ]
    #    b
    #    c d
    #    e f

    # -- ❌ CASE 4: There is exactly one trailing single-cell row.
    #      -> core table is dropped. trailing single-cell row is emitted as Title
    #         (this is the behavior in the reported bug)
    #
    #    a b  -> [ Title(e) ]
    #    c d
    #      e

    # -- ❌ CASE 5: There are two-or-more trailing single-cell rows.
    #      -> core table is dropped. trailing single-cell rows are each emitted as a Title
    #
    #    a b  -> [ Title(e),
    #    c d       Title(f) ]
    #      e
    #      f

    # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows.
    #      -> core table is correctly identified, leading and trailing single-cell rows are each
    #         emitted as a Title.
    #
    #      a  -> [ Title(a),
    #    b c       Table(b, c, d, e),
    #    d e       Title(f) ]
    #    f

    # -- ✅ CASE 7: There are two leading and one trailing single-cell rows.
    #      -> core table is correctly identified, leading and trailing single-cell rows are each
    #         emitted as a Title.
    #
    #    a    -> [ Title(a),
    #    b         Title(b),
    #    c d       Table(c, d, e, f),
    #    e f       Title(g) ]
    #      g

    # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows.
    #      -> core table is correctly identified, leading and trailing single-cell rows are each
    #         emitted as a Title.
    #
    #      a  -> [ Title(a),
    #      b       Title(b),
    #    c d       Table(c, d, e, f),
    #    e f       Title(g),
    #    g         Title(h) ]
    #    h

    # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below.
    #      -> First cell is mistakenly emitted as title, remaining cells are dropped.
    #
    #    a b c  -> [ Title(a) ]

    # -- ❌ CASE 10: Single-row subtable with one leading single-cell row.
    #      -> Leading single-row cell is correctly identified as title, core-table is mis-identified
    #         as a `Title` and truncated.
    #
    #    a      -> [ Title(a),
    #    b c d       Title(b) ]
```
											
										
										
											2024-02-13 20:29:17 -08:00
+								    assert len(elements) == 14
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
 								    assert clean_extra_whitespace(elements[0].text)[:45] == EXPECTED_XLS_INITIAL_45_CLEAN_TEXT
-												build(release): bump unstructured-inference (#1074)

* build(release): bump unstructured-inference

Related to downstream issue:
Unstructured-IO/unstructured-api#182

And upstream PR:
Unstructured-IO/unstructured-inference#165

---------

Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
											
										
										
											2023-08-10 13:57:46 -07:00
+								    # NOTE(crag): if the beautifulsoup4 package is installed, some (but not all) additional
 								    # whitespace is removed, so the expected text length is less than is the case
 								    # when beautifulsoup4 is *not* installed. E.g.
 								    # "\n\n\nMA\nWhat C datatypes are 8 bits" vs.
 								    # '\n  \n    \n      MA\n      What C datatypes are 8 bits?... "
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
+								    assert len(elements[0].text) == EXPECTED_XLS_TEXT_LEN
 								    assert elements[0].metadata.text_as_html == EXPECTED_XLS_TABLE
-												fix: add more mime types for csv (#620)


											
										
										
											2023-05-19 17:40:26 -04:00
+								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
+								def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
 								    elements = partition(filename=filename)
-												feat: partition_tsv for tab separated value files (#758)

* first pass at partition_tsv

* working tests

* create constants for tests and debug `make test` failure

* make check and tidy

* undo changes for testing locally

* update changelog and version

* fix bricks.rst

* refactor if statements

* make tidy

* fix README and change try/except to if/else

* update changelog and version

* fix\ docstring
											
										
										
											2023-06-15 13:50:53 -05:00
+								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
 								    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
+								    assert elements[0].metadata.filetype == "text/csv"
-												Chore: Pass table support  param to partition image (#973)

* add param and test in image table extraction

* version and changelog

* need to publish this one for api repo

* add new param skip_infer_table_types

* use warning

* clean up with mapping

* add test for tsv

* fix test fail

* weird change from merge

* doc nit

* don't use mapping

* correct conflict
											
										
										
											2023-07-27 13:33:36 -04:00
+								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
 								def test_auto_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"):
 								    elements = partition(filename=filename)
 								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
 								    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
 								    assert elements[0].metadata.filetype == "text/tsv"
-												fix: add more mime types for csv (#620)


											
										
										
											2023-05-19 17:40:26 -04:00
+								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
+								def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
 								    with open(filename, "rb") as f:
 								        elements = partition(file=f)
-												feat: partition_tsv for tab separated value files (#758)

* first pass at partition_tsv

* working tests

* create constants for tests and debug `make test` failure

* make check and tidy

* undo changes for testing locally

* update changelog and version

* fix bricks.rst

* refactor if statements

* make tidy

* fix README and change try/except to if/else

* update changelog and version

* fix\ docstring
											
										
										
											2023-06-15 13:50:53 -05:00
+								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
+								    assert isinstance(elements[0], Table)
-												feat: partition_tsv for tab separated value files (#758)

* first pass at partition_tsv

* working tests

* create constants for tests and debug `make test` failure

* make check and tidy

* undo changes for testing locally

* update changelog and version

* fix bricks.rst

* refactor if statements

* make tidy

* fix README and change try/except to if/else

* update changelog and version

* fix\ docstring
											
										
										
											2023-06-15 13:50:53 -05:00
+								    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
+								    assert elements[0].metadata.filetype == "text/csv"
-												enhancement: handling for empty files in `detect_filetype` and `partition` (#710)

* add empty filetype

* add empty handling to partition

* changelog and version
											
										
										
											2023-06-09 16:07:50 -04:00
-												feature(html partition): parse pre tag (#642)

* feature(html partition): parse pre tag

* chore: update CHANGELOG.md

* style: black format xml.py

* Added tests dor html with pre tag

* remove skip test, update parse pre tag

* fix style

* chore: spell check

* chore: update changelog & version

* chore: update ingest test fixtures

* chore: add exception handling if `element.text` is `None` in `_read_xml`

* test: add more sanity testing on the `.text` content of the element(s)

* refactor: move the conditional logic for <pre> outside of the `try/except` block

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
											
										
										
											2023-06-27 21:52:39 +03:00
+								def test_auto_partition_html_pre_from_file(filename="example-docs/fake-html-pre.htm"):
 								    elements = partition(filename=filename)
 								    assert len(elements) > 0
-												Avoid setting metadata in constructor signature for elements (#837)

Avoid setting metadata in constructor signature for elements because that can lead to unexpected object reuse (and modification).

Bonus refactor for PageBreak to have text values of "".

---------

Co-authored-by: Alan Bertl <alan@unstructured.io>
Co-authored-by: Crag Wolfe <crag@unstructuredai.io>
											
										
										
											2023-06-28 23:14:05 -04:00
+								    assert "PageBreak" not in [elem.category for elem in elements]
-												fix: respect `<pre>` tag order in `partition_html` (#1197)

### Summary

Closes #1184. Updates `partition_html` to respect the ordering of
`<pre>` tags in HTML documents.

### Testing

The elements in the following example should be in the correct order.

```python
    from unstructured.partition.html import partition_html

    html_text = """
    <pre>The Big Brown Bear</pre>
    <div>The big brown bear is growling.</div>
    <pre>The big brown bear is sleeping.</pre>
    <div>The Big Blue Bear</div>
    """
    elements = partition_html(text=html_text)
    print("\n\n".join([str(el) for el in elements]))
```
											
										
										
											2023-08-25 00:14:48 -04:00
+								    assert clean_extra_whitespace(elements[0].text).startswith("[107th Congress Public Law 56]")
 								    assert isinstance(elements[0], NarrativeText)
-												feature(html partition): parse pre tag (#642)

* feature(html partition): parse pre tag

* chore: update CHANGELOG.md

* style: black format xml.py

* Added tests dor html with pre tag

* remove skip test, update parse pre tag

* fix style

* chore: spell check

* chore: update changelog & version

* chore: update ingest test fixtures

* chore: add exception handling if `element.text` is `None` in `_read_xml`

* test: add more sanity testing on the `.text` content of the element(s)

* refactor: move the conditional logic for <pre> outside of the `try/except` block

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
											
										
										
											2023-06-27 21:52:39 +03:00
+								    assert elements[0].metadata.filetype == "text/html"
 								    assert elements[0].metadata.filename == "fake-html-pre.htm"
-												enhancement: handling for empty files in `detect_filetype` and `partition` (#710)

* add empty filetype

* add empty handling to partition

* changelog and version
											
										
										
											2023-06-09 16:07:50 -04:00
+								def test_auto_partition_works_on_empty_filename(filename="example-docs/empty.txt"):
 								    assert partition(filename=filename) == []
 								def test_auto_partition_works_on_empty_file(filename="example-docs/empty.txt"):
 								    with open(filename, "rb") as f:
 								        assert partition(file=f) == []
-												feat: `partition_rst` for ReStructured Text documents (#725)

* add example rst file

* filetype detection for rst files

* add partition_rst function

* add partition_rst to auto

* update readme

* update docs

* changelog and version

* pandocs -> pandoc

* fix typo
											
										
										
											2023-06-12 15:31:10 -04:00
-												feat: partition_org for Org Mode documents (#780)

* feat: partition_org for Org Mode documents

* update version
											
										
										
											2023-06-23 20:45:31 +02:00
+								def test_auto_partition_org_from_filename(filename="example-docs/README.org"):
 								    elements = partition(filename=filename)
 								    assert elements[0] == Title("Example Docs")
 								    assert elements[0].metadata.filetype == "text/org"
 								def test_auto_partition_org_from_file(filename="example-docs/README.org"):
 								    with open(filename, "rb") as f:
 								        elements = partition(file=f, content_type="text/org")
 								    assert elements[0] == Title("Example Docs")
 								    assert elements[0].metadata.filetype == "text/org"
-												feat: `partition_rst` for ReStructured Text documents (#725)

* add example rst file

* filetype detection for rst files

* add partition_rst function

* add partition_rst to auto

* update readme

* update docs

* changelog and version

* pandocs -> pandoc

* fix typo
											
										
										
											2023-06-12 15:31:10 -04:00
+								def test_auto_partition_rst_from_filename(filename="example-docs/README.rst"):
 								    elements = partition(filename=filename)
 								    assert elements[0] == Title("Example Docs")
 								    assert elements[0].metadata.filetype == "text/x-rst"
 								def test_auto_partition_rst_from_file(filename="example-docs/README.rst"):
 								    with open(filename, "rb") as f:
 								        elements = partition(file=f, content_type="text/x-rst")
 								    assert elements[0] == Title("Example Docs")
 								    assert elements[0].metadata.filetype == "text/x-rst"
-												fixed filename metadata bug when using file and file_filename (#1002)


											
										
										
											2023-08-02 18:14:15 -07:00
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								def test_auto_partition_metadata_filename():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
 								    with open(filename) as f:
 								        elements = partition(file=f, metadata_filename=filename)
 								    assert elements[0].metadata.filename == os.path.split(filename)[-1]
 								def test_auto_partition_warns_about_file_filename_deprecation(caplog):
-												fixed filename metadata bug when using file and file_filename (#1002)


											
										
										
											2023-08-02 18:14:15 -07:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
 								    with open(filename) as f:
 								        elements = partition(file=f, file_filename=filename)
 								    assert elements[0].metadata.filename == os.path.split(filename)[-1]
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    assert "WARNING" in caplog.text
 								    assert "The file_filename kwarg will be deprecated" in caplog.text
 								def test_auto_partition_raises_with_file_and_metadata_filename():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
 								    with open(filename) as f, pytest.raises(ValueError):
 								        partition(file=f, file_filename=filename, metadata_filename=filename)
-												enhancement: tell users to install missing extras (#1167)

### Summary

Updates `partition` to let users know to installs the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating `partition_pdf` (or whichever function that requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
											
										
										
											2023-08-21 23:00:21 -04:00
 								def test_get_partition_with_extras_prompts_for_install_if_missing():
 								    partition_with_extras_map = {}
 								    with pytest.raises(ImportError) as exception_info:
 								        _get_partition_with_extras("pdf", partition_with_extras_map)
 								    msg = str(exception_info.value)
 								    assert 'Install the pdf dependencies with pip install "unstructured[pdf]"' in msg
-												chunk_by_title decorator (#1304)

### Summary

Partial solution to #1185.
Related to #1222.
Creates decorator from `chunk_by_title` cleaning brick.
Breaks a document into sections based on the presence of Title elements.
Also starts a new section under the following conditions:

- If metadata changes, indicating a change in section or page or a
switch to processing attachments. If `multipage_sections=True`, sections
can span pages. `multipage_sections` defaults to True.
- If the length of the section exceeds `new_after_n_chars` characters.
The default is 1500. The **chunking function does not split individual
elements**, so it's possible for a section to exceed that threshold if
an individual element if over `new_after_n_chars characters`, which
could occur with a long NarrativeText element.

Combines sections under these conditions
- Sections under `combine_under_n_chars` characters are combined. The
default is 500.

### Testing

from unstructured.partition.html import partition_html

url = "https://understandingwar.org/backgrounder/russian-offensive-campaign-assessment-august-27-2023-0"
chunks = partition_html(url=url, chunking_strategy="by_title")

for chunk in chunks:
    print(chunk)
    print("\n\n" + "-"*80)
    input()

											
										
										
											2023-09-11 16:00:14 -05:00
 								def test_add_chunking_strategy_on_partition_auto():
 								    filename = "example-docs/example-10k-1p.html"
 								    elements = partition(filename)
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
ln [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								    chunk_elements = partition(filename, chunking_strategy="by_title")
-												chunk_by_title decorator (#1304)

### Summary

Partial solution to #1185.
Related to #1222.
Creates decorator from `chunk_by_title` cleaning brick.
Breaks a document into sections based on the presence of Title elements.
Also starts a new section under the following conditions:

- If metadata changes, indicating a change in section or page or a
switch to processing attachments. If `multipage_sections=True`, sections
can span pages. `multipage_sections` defaults to True.
- If the length of the section exceeds `new_after_n_chars` characters.
The default is 1500. The **chunking function does not split individual
elements**, so it's possible for a section to exceed that threshold if
an individual element if over `new_after_n_chars characters`, which
could occur with a long NarrativeText element.

Combines sections under these conditions
- Sections under `combine_under_n_chars` characters are combined. The
default is 500.

### Testing

from unstructured.partition.html import partition_html

url = "https://understandingwar.org/backgrounder/russian-offensive-campaign-assessment-august-27-2023-0"
chunks = partition_html(url=url, chunking_strategy="by_title")

for chunk in chunks:
    print(chunk)
    print("\n\n" + "-"*80)
    input()

											
										
										
											2023-09-11 16:00:14 -05:00
+								    chunks = chunk_by_title(elements)
 								    assert chunk_elements != elements
 								    assert chunk_elements == chunks
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
ln [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
 								    filename = "example-docs/example-10k-1p.html"
 								    # default chunk size in chars is 200
 								    partitioned_table_elements_200_chars = [
 								        e
 								        for e in partition(
 								            filename,
 								            chunking_strategy="by_title",
 								            max_characters=200,
 								            combine_text_under_n_chars=5,
 								        )
 								        if isinstance(e, (Table, TableChunk))
 								    ]
 								    partitioned_table_elements_5_chars = [
 								        e
 								        for e in partition(
 								            filename,
 								            chunking_strategy="by_title",
 								            max_characters=5,
 								            combine_text_under_n_chars=5,
 								        )
 								        if isinstance(e, (Table, TableChunk))
 								    ]
 								    elements = partition(filename)
 								    table_elements = [e for e in elements if isinstance(e, Table)]
 								    assert len(partitioned_table_elements_5_chars) != len(table_elements)
 								    assert len(partitioned_table_elements_200_chars) != len(table_elements)
-												feature(chunking): add basic strategy and overlap (#2367)

This PR culminates the restructuring of chunking over my prior
dozen-or-so commits by adding the new options to the API and
documentation.

Separately I'll be adding a new ingest test to defend against
regression, although the integration test included in this PR will do a
pretty good job of that too.
											
										
										
											2024-01-10 14:19:24 -08:00
+								    # trailing whitespace is stripped from the first chunk, leaving only a checkbox character
 								    assert len(partitioned_table_elements_5_chars[0].text) == 1
 								    # but the second chunk is the full 5 characters
-												rfctr(chunking): split oversized chunks on word boundary (#2297)

The text of an oversized chunk is split on an arbitrary character
boundary (mid-word). The `chunk_by_character()` strategy introduces the
idea of allowing the user to specify a separator to use for
chunk-splitting. For `langchain` this is typically "\n\n", "\n", or " ";
blank-line, newline, or word boundaries respectively.

Even if the user is allowed to specify a separator, we must provide
fall-back for when a chunk contains no such character. This can be done
incrementally, like blank-line is preferable to newline, newline is
preferable to word, and word is preferable to arbitrary character.

Further, there is nothing particular to `chunk_by_character()` in
providing such a fall-back text-splitting strategy. It would be
preferable for all strategies to split oversized chunks on even-word
boundaries for example.

Note that while a "blank-line" ("\n\n") may be common in plain text, it
is unlikely to appear in the text of an element because it would have
been interpreted as an element boundary during partitioning.

Add _TextSplitter with basic separator preferences and fall-back and
apply it to chunk-splitting for all strategies. The `by_character`
chunking strategy may enhance this behavior by adding the option for a
user to specify a particular separator suited to their use case.
											
										
										
											2023-12-20 21:45:36 -08:00
+								    assert len(partitioned_table_elements_5_chars[1].text) == 5
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
ln [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								    assert len(partitioned_table_elements_5_chars[0].metadata.text_as_html) == 5
 								    # the first table element is under 200 chars so doesn't get chunked!
 								    assert table_elements[0] == partitioned_table_elements_200_chars[0]
 								    assert len(partitioned_table_elements_200_chars[0].text) < 200
-												feature(chunking): add basic strategy and overlap (#2367)

This PR culminates the restructuring of chunking over my prior
dozen-or-so commits by adding the new options to the API and
documentation.

Separately I'll be adding a new ingest test to defend against
regression, although the integration test included in this PR will do a
pretty good job of that too.
											
										
										
											2024-01-10 14:19:24 -08:00
+								    assert len(partitioned_table_elements_200_chars[1].text) == 198
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
ln [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								    assert len(partitioned_table_elements_200_chars[1].metadata.text_as_html) == 200
 								def test_add_chunking_strategy_chars_on_partition_auto_adds_is_continuation():
 								    filename = "example-docs/example-10k-1p.html"
-												chore: adding max_characters to other element type chunking (#1673)

This PR adds the `max_characters` (hard max) param to non-table element
chunking. Additionally updates the `num_characters` metadata to
`max_characters` to make it clearer which param we're referencing.

To test:

```
from unstructured.partition.html import partition_html

filename = "example-docs/example-10k-1p.html"
chunk_elements = partition_html(
        filename,
        chunking_strategy="by_title",
        combine_text_under_n_chars=0,
        new_after_n_chars=50,
        max_characters=100,
    )

for chunk in chunk_elements:
     print(len(chunk.text))

# previously we were only respecting the "soft max" (default of 500) for elements other than tables
# now we should see that all the elements have text fields under 100 chars.
```

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-09 12:42:36 -07:00
+								    table_elements = [e for e in partition(filename) if isinstance(e, Table)]
 								    chunked_table_elements = [
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
ln [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								        e
 								        for e in partition(
 								            filename,
-												chore: adding max_characters to other element type chunking (#1673)

This PR adds the `max_characters` (hard max) param to non-table element
chunking. Additionally updates the `num_characters` metadata to
`max_characters` to make it clearer which param we're referencing.

To test:

```
from unstructured.partition.html import partition_html

filename = "example-docs/example-10k-1p.html"
chunk_elements = partition_html(
        filename,
        chunking_strategy="by_title",
        combine_text_under_n_chars=0,
        new_after_n_chars=50,
        max_characters=100,
    )

for chunk in chunk_elements:
     print(len(chunk.text))

# previously we were only respecting the "soft max" (default of 500) for elements other than tables
# now we should see that all the elements have text fields under 100 chars.
```

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-09 12:42:36 -07:00
+								            chunking_strategy="by_title",
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
ln [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								        )
 								        if isinstance(e, Table)
 								    ]
-												chore: adding max_characters to other element type chunking (#1673)

This PR adds the `max_characters` (hard max) param to non-table element
chunking. Additionally updates the `num_characters` metadata to
`max_characters` to make it clearer which param we're referencing.

To test:

```
from unstructured.partition.html import partition_html

filename = "example-docs/example-10k-1p.html"
chunk_elements = partition_html(
        filename,
        chunking_strategy="by_title",
        combine_text_under_n_chars=0,
        new_after_n_chars=50,
        max_characters=100,
    )

for chunk in chunk_elements:
     print(len(chunk.text))

# previously we were only respecting the "soft max" (default of 500) for elements other than tables
# now we should see that all the elements have text fields under 100 chars.
```

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-09 12:42:36 -07:00
+								    assert table_elements != chunked_table_elements
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
ln [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								    i = 0
-												chore: adding max_characters to other element type chunking (#1673)

This PR adds the `max_characters` (hard max) param to non-table element
chunking. Additionally updates the `num_characters` metadata to
`max_characters` to make it clearer which param we're referencing.

To test:

```
from unstructured.partition.html import partition_html

filename = "example-docs/example-10k-1p.html"
chunk_elements = partition_html(
        filename,
        chunking_strategy="by_title",
        combine_text_under_n_chars=0,
        new_after_n_chars=50,
        max_characters=100,
    )

for chunk in chunk_elements:
     print(len(chunk.text))

# previously we were only respecting the "soft max" (default of 500) for elements other than tables
# now we should see that all the elements have text fields under 100 chars.
```

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-09 12:42:36 -07:00
+								    for table in chunked_table_elements:
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
ln [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								        # have to reset the counter to 0 here when we encounter a Table element
 								        if isinstance(table, Table):
 								            i = 0
 								        if i > 0 and isinstance(table, TableChunk):
 								            assert table.metadata.is_continuation is True
 								            i += 1
-												detect document language across all partitioners (#1627)

### Summary
Closes #1534 and #1535
Detects document language using `langdetect` package. 
Creates new kwargs for user to set the document language (`languages`)
or detect the language at the element level instead of the default
document level (`detect_language_per_element`)

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
											
										
										
											2023-10-10 20:47:56 -05:00
 								EXAMPLE_LANG_DOCS = "example-docs/language-docs/eng_spa_mult."
 								@pytest.mark.parametrize(
 								    "file_extension",
 								    [
 								        "doc",
 								        "docx",
 								        "eml",
 								        "epub",
 								        "html",
 								        "md",
 								        "odt",
 								        "org",
 								        "ppt",
 								        "pptx",
 								        "rst",
 								        "rtf",
 								        "txt",
 								        "xml",
 								    ],
 								)
 								def test_partition_respects_language_arg(file_extension):
 								    filename = EXAMPLE_LANG_DOCS + file_extension
 								    elements = partition(filename=filename, languages=["deu"])
 								    assert all(element.metadata.languages == ["deu"] for element in elements)
 								def test_partition_respects_detect_language_per_element_arg():
 								    filename = "example-docs/language-docs/eng_spa_mult.txt"
 								    elements = partition(filename=filename, detect_language_per_element=True)
 								    langs = [element.metadata.languages for element in elements]
 								    assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
 								# check that the ["eng"] default in `partition` does not overwrite the ["auto"]
 								# default in other `partition_` functions.
 								def test_partition_default_does_not_overwrite_other_defaults():
 								    # the default for `languages` is ["auto"] in partiton_text
 								    from unstructured.partition.text import partition_text
 								    # Use a document that is primarily in a language other than English
 								    filename = "example-docs/language-docs/UDHR_first_article_all.txt"
 								    text_elements = partition_text(filename)
 								    assert text_elements[0].metadata.languages != ["eng"]
 								    auto_elements = partition(filename)
 								    assert auto_elements[0].metadata.languages != ["eng"]
 								    assert auto_elements[0].metadata.languages == text_elements[0].metadata.languages
-												fix: default to None for the languages metadata field (#1743)

### Summary
Closes #1714
Changes the default value for `languages` to `None` for elements that
don't have text or the language can't be detected.

### Testing
```
from unstructured.partition.auto import partition
filename = "example-docs/handbook-1p.docx"
elements = partition(filename=filename, detect_language_per_element=True)

# PageBreak elements don't have text and will be collected here
none_langs = [element for element in elements if element.metadata.languages is None]
none_langs[0].text
```

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-14 17:46:24 -05:00
 								def test_partition_languages_default_to_None():
 								    filename = "example-docs/handbook-1p.docx"
 								    elements = partition(filename=filename, detect_language_per_element=True)
 								    # PageBreak and other elements with no text will have `None` for `languages`
 								    none_langs = [element for element in elements if element.metadata.languages is None]
 								    assert none_langs[0].text == ""
 								def test_partition_languages_incorrectly_defaults_to_English(tmpdir):
 								    # We don't totally rely on langdetect for short text, so text like the following that is
 								    # in German will be labeled as English.
 								    german = "Ein kurzer Satz."
 								    filepath = os.path.join(tmpdir, "short-german.txt")
 								    with open(filepath, "w") as f:
 								        f.write(german)
 								    elements = partition(filepath)
 								    assert elements[0].metadata.languages == ["eng"]
-												feat: enable request timeout (#2013)

Courtesy @cdpierse.

Adds a test to PR #1529 in accordance with feedback.

Description from original PR:

In python the default behaviour of `requests.get` without a `timeout`
being set is to hang indefinitely. We have a production use case where
the desired behaviour would be to raise a timeout error rather than have
the application just hang.

This PR adds a new optional keyword parameter `request_timeout` to
`partition` which is passed to `file_and_type_from_url` in the case
where we are fetching from a URL. This is then passed to `requests.get`

---------

Co-authored-by: Charles Pierse <charlespierse@gmail.com>
											
										
										
											2023-11-07 18:44:58 -06:00
 								def test_partition_timeout_gets_routed():
 								    class CallException(Exception):
 								        pass
 								    mock_ocr_func = Mock(side_effect=CallException("Function called!"))
 								    with patch("unstructured.partition.auto.file_and_type_from_url", mock_ocr_func), pytest.raises(
 								        CallException
 								    ):
 								        auto.partition(url="fake_url", request_timeout=326)
 								    kwargs = mock_ocr_func.call_args.kwargs
 								    assert "request_timeout" in kwargs
 								    assert kwargs["request_timeout"] == 326
-												enhancement: add support from bitmap images (#2414)

### Summary

Adds support for bitmap images (`.bmp`) in both file detection and
partitioning. Bitmap images will be processed with `partition_image`
just like JPGs and PNGs.

### Testing

```python
from unstructured.file_utils.filetype import detect_filetype
from unstructured.partition.auto import partition
from PIL import Image

filename = "example-docs/layout-parser-paper-with-table.jpg"
bmp_filename = "~/tmp/ayout-parser-paper-with-table.bmp"

img = Image.open(filename)
img.save(bmp_filename)

detect_filetype(filename=bmp_filename) # Should be FileType.BMP

elements = partition(filename=bmp_filename)
```
											
										
										
											2024-01-17 17:50:36 -05:00
 								def test_partition_image_with_bmp_with_auto(
 								    tmpdir,
 								    filename="example-docs/layout-parser-paper-with-table.jpg",
 								):
 								    bmp_filename = os.path.join(tmpdir.dirname, "example.bmp")
 								    img = Image.open(filename)
 								    img.save(bmp_filename)
 								    elements = partition(
 								        filename=bmp_filename,
 								        strategy=PartitionStrategy.HI_RES,
 								    )
 								    table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
 								    assert len(table) == 1
-												chore: bump unstructured-inference 0.7.35 (#3205)

### Summary
- bump unstructured-inference to `0.7.35` which fixed syntax for
generated HTML tables
- update unit tests and ingest test fixtures to reflect changes in the
generated HTML tables
- cut a release for `0.14.6`

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-06-14 11:11:38 -07:00
+								    assert "<table><thead><tr>" in table[0]
 								    assert "</thead><tbody><tr>" in table[0]
-												enhancement: process `.p7s` files with `partition_email` (#2521)

### Summary

Closes #2489, which reported an inability to process `.p7s` files. PR
implements two changes:

- If the user selected content type for the email is not available and
there is another valid content type available, fall back to the other
valid content type.
- For signed message, extract the signature and add it to the metadata


### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/eml/signed-doc.p7s"
elements = partition(filename=filename) # should get a message about fall back logic
print(elements[0]) # "This is a test"
elements[0].metadata.to_dict() # Will see the signature
```
											
										
										
											2024-02-07 17:31:49 -05:00
 								def test_auto_partition_eml_add_signature_to_metadata():
 								    elements = partition(filename="example-docs/eml/signed-doc.p7s")
 								    assert len(elements) == 1
 								    assert elements[0].text == "This is a test"
 								    assert elements[0].metadata.signature == "<SIGNATURE>\n"