# unstructured/test_unstructured/partition/test_auto.py

# pyright: reportPrivateUsage=false

from __future__ import annotations

import json
import os
import pathlib
import sys
import tempfile
import warnings
from importlib import import_module
from typing import Callable, Iterator, cast
from unittest.mock import Mock, patch

import docx
import pytest
from docx.document import Document
from PIL import Image

from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
from test_unstructured.partition.test_constants import (
    EXPECTED_TABLE,
    EXPECTED_TABLE_XLSX,
    EXPECTED_TEXT,
    EXPECTED_TEXT_XLSX,
    EXPECTED_TITLE,
)
from test_unstructured.unit_utils import (
    ANY,
    FixtureRequest,
    LogCaptureFixture,
    MonkeyPatch,
    example_doc_path,
    function_mock,
    method_mock,
)
from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import (
    Address,
    Element,
    ElementMetadata,
    ListItem,
    NarrativeText,
    Table,
    TableChunk,
    Text,
    Title,
)
from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, FileType
from unstructured.partition import auto
from unstructured.partition.auto import _get_partition_with_extras, partition
from unstructured.partition.common import convert_office_doc
from unstructured.partition.utils.constants import PartitionStrategy
from unstructured.staging.base import elements_from_json, elements_to_dicts, elements_to_json

# -- the presence of this marker file is used as the signal that the tests are running inside a
# -- Docker container; several tests below are skipped in that case.
_DOCKERENV_MARKER = "/.dockerenv"
is_in_docker = os.path.exists(_DOCKERENV_MARKER)


# ================================================================================================
# CSV
# ================================================================================================


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_csv_from_filename():
    """A CSV given by path yields a first element with the expected text, HTML and filetype."""
    csv_path = example_doc_path("stanley-cups.csv")

    table = partition(csv_path)[0]

    assert clean_extra_whitespace(table.text) == EXPECTED_TEXT
    assert table.metadata.text_as_html == EXPECTED_TABLE
    assert table.metadata.filetype == "text/csv"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_csv_from_file():
    """A CSV given as an open binary file yields a `Table` with the expected text and HTML."""
    csv_path = example_doc_path("stanley-cups.csv")
    with open(csv_path, "rb") as csv_file:
        elements = partition(file=csv_file)

    table = elements[0]
    assert clean_extra_whitespace(table.text) == EXPECTED_TEXT
    assert isinstance(table, Table)
    assert table.metadata.text_as_html == EXPECTED_TABLE
    assert table.metadata.filetype == "text/csv"


# ================================================================================================
# DOC
# ================================================================================================


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
)
def test_auto_partition_doc_with_filename(
    mock_docx_document: Document,
    expected_docx_elements: list[Element],
    tmp_path: pathlib.Path,
    pass_metadata_filename: bool,
    content_type: str | None,
):
    """A DOC file given by path partitions to the expected elements and file metadata.

    The DOC file is produced by saving the fixture DOCX document into `tmp_path` and converting
    it with `convert_office_doc()`. Each case varies whether `metadata_filename` and/or
    `content_type` are passed to `partition()`.
    """
    docx_path = tmp_path / "mock_document.docx"
    doc_path = tmp_path / "mock_document.doc"
    output_dir = str(tmp_path)

    # -- create the .doc file under test from the in-memory .docx fixture --
    mock_docx_document.save(str(docx_path))
    convert_office_doc(str(docx_path), output_dir, "doc")

    elements = partition(
        filename=str(doc_path),
        metadata_filename=str(doc_path) if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    assert elements == expected_docx_elements
    first_metadata = elements[0].metadata
    assert first_metadata.filename == "mock_document.doc"
    assert first_metadata.file_directory == output_dir


@pytest.mark.skipif(is_in_docker, reason="Passes in CI but not Docker. Remove skip on #3364 fix.")
@pytest.mark.xfail(sys.platform == "darwin", reason="#3364", raises=KeyError, strict=True)
def test_auto_partition_doc_with_file():
    """A DOC file given as an open binary file partitions to the expected elements."""
    # -- NOTE(scanny): https://github.com/Unstructured-IO/unstructured/issues/3364
    # -- detect_filetype() identifies .doc as `application/x-ole-storage` which is true but not
    # -- specific enough. The `FileType.MSG` file-type is assigned (which is also an OLE file)
    # -- and `partition()` routes the document to `partition_msg` which is where the `KeyError`
    # -- comes from.
    # -- For some reason, this xfail problem only occurs locally, not in CI, possibly because we
    # -- use two different `libmagic` sources (`libmagic` on CI and `libmagic1` on Mac). Doesn't
    # -- matter much though because when we add disambiguation they'll both get it right.
    expected_elements = [
        Title("These are a few of my favorite things:"),
        ListItem("Parrots"),
        ListItem("Hockey"),
        Title("Analysis"),
        NarrativeText("This is my first thought. This is my second thought."),
        NarrativeText("This is my third thought."),
        Text("2023"),
        Address("DOYLESTOWN, PA 18901"),
    ]

    with open(example_doc_path("simple.doc"), "rb") as doc_file:
        elements = partition(file=doc_file)

    assert elements == expected_elements


# ================================================================================================
# DOCX
# ================================================================================================


@pytest.fixture()
def mock_docx_document():
    """Build an in-memory DOCX document with one paragraph per case the tests care about."""
    document = docx.Document()

    # -- (text, style-name) pairs, added to the document in this order --
    styled_paragraphs = (
        ("These are a few of my favorite things:", "Heading 1"),
        # NOTE(robinson) - this should get picked up as a list item due to the •
        ("• Parrots", "Normal"),
        ("Hockey", "List Bullet"),
        # NOTE(robinson) - this should get picked up as a title
        ("Analysis", "Normal"),
        # NOTE(robinson) - this should get dropped because it is empty
        ("", "Normal"),
        # NOTE(robinson) - this should get picked up as a narrative text
        ("This is my first thought. This is my second thought.", "Normal"),
        ("This is my third thought.", "Body Text"),
    )
    for paragraph_text, style_name in styled_paragraphs:
        document.add_paragraph(paragraph_text, style=style_name)

    # NOTE(robinson) - this should just be regular text
    document.add_paragraph("2023")

    return document


@pytest.fixture()
def expected_docx_elements():
    """Elements the DOCX tests expect when partitioning the `mock_docx_document` fixture."""
    expected = (
        (Title, "These are a few of my favorite things:"),
        (ListItem, "Parrots"),
        (ListItem, "Hockey"),
        (Title, "Analysis"),
        (NarrativeText, "This is my first thought. This is my second thought."),
        (NarrativeText, "This is my third thought."),
        (Text, "2023"),
    )
    return [element_cls(text) for element_cls, text in expected]


def test_auto_partition_docx_with_filename(
    mock_docx_document: Document, expected_docx_elements: list[Element], tmp_path: pathlib.Path
):
    """A DOCX file given by path partitions to the expected elements and filename metadata."""
    docx_path = str(tmp_path / "mock_document.docx")
    mock_docx_document.save(docx_path)

    elements = partition(filename=docx_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
    assert elements[0].metadata.filename == os.path.basename(docx_path)


def test_auto_partition_docx_with_file(
    mock_docx_document: Document, expected_docx_elements: list[Element], tmp_path: pathlib.Path
):
    """A DOCX file given as an open binary file partitions to the expected elements."""
    docx_path = str(tmp_path / "mock_document.docx")
    mock_docx_document.save(docx_path)

    with open(docx_path, "rb") as docx_file:
        elements = partition(file=docx_file, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements


@pytest.mark.parametrize("file_name", ["simple.docx", "simple.doc", "simple.odt"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_docx_and_its_brokers(
    request: FixtureRequest, file_name: str, strategy: str
):
    """The `strategy` arg value received by `partition()` is received by `partition_docx()`.

    In the brokering-partitioner cases (DOC, ODT) the value must first reach `partition_doc()` or
    `partition_odt()`, which must then forward it to `partition_docx()`. This test confirms it
    arrives at the DOCX partitioner by replacing the partitioner's element generator with one
    that reports the strategy value it sees.

    Note this is 3 file-types X 4 strategies = 12 test-cases.
    """
    from unstructured.partition.docx import _DocxPartitioner

    def report_strategy(self: _DocxPartitioner) -> Iterator[Element]:
        # -- emit a single element whose text records the strategy that reached the partitioner --
        yield Text(f"strategy=={self._opts.strategy}")

    iter_document_elements_ = method_mock(
        request,
        _DocxPartitioner,
        "_iter_document_elements",
        side_effect=report_strategy,
    )
    doc_path = example_doc_path(file_name)

    # -- exactly one element is expected; unpacking fails otherwise --
    [element] = partition(doc_path, strategy=strategy)

    iter_document_elements_.assert_called_once_with(ANY)
    assert element.text == f"strategy=={strategy}"


# ================================================================================================
# EML
# ================================================================================================

# -- elements the EML tests below expect from partitioning `eml/fake-email.eml` --
EXPECTED_EMAIL_OUTPUT = [
    NarrativeText("This is a test email to use for unit tests."),
    Title("Important points:"),
    ListItem("Roses are red"),
    ListItem("Violets are blue"),
]


def test_auto_partition_email_from_filename():
    """An EML file given by path partitions to the expected elements and file metadata."""
    eml_path = example_doc_path("eml/fake-email.eml")
    directory, name = os.path.split(eml_path)

    elements = partition(eml_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
    first_metadata = elements[0].metadata
    assert first_metadata.filename == name
    assert first_metadata.file_directory == directory


def test_auto_partition_email_from_file():
    """An EML file given as an open binary file partitions to the expected elements."""
    eml_path = example_doc_path("eml/fake-email.eml")
    with open(eml_path, "rb") as eml_file:
        elements = partition(file=eml_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT


def test_auto_partition_eml_add_signature_to_metadata():
    """Partitioning `eml/signed-doc.p7s` yields one element with `signature` metadata set."""
    signed_doc_path = example_doc_path("eml/signed-doc.p7s")

    elements = partition(signed_doc_path)

    assert len(elements) == 1
    only_element = elements[0]
    assert only_element.text == "This is a test"
    assert only_element.metadata.signature == "<SIGNATURE>\n"


# ================================================================================================
# EPUB
# ================================================================================================


def test_auto_partition_epub_from_filename():
    """An EPUB file given by path partitions to elements starting with the book's title line."""
    epub_path = example_doc_path("winter-sports.epub")

    elements = partition(epub_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    opening_text = elements[0].text
    assert opening_text.startswith("The Project Gutenberg eBook of Winter Sports")


def test_auto_partition_epub_from_file():
    """An EPUB given as an open binary file partitions to elements starting with the title."""
    epub_path = example_doc_path("winter-sports.epub")
    with open(epub_path, "rb") as epub_file:
        elements = partition(file=epub_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    opening_text = elements[0].text
    assert opening_text.startswith("The Project Gutenberg eBook of Winter Sports")


# ================================================================================================
# HTML
# ================================================================================================


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_filename(pass_metadata_filename: bool, content_type: str | None):
    """An HTML file given by path partitions to elements carrying its name and directory.

    Each case varies whether `metadata_filename` and/or `content_type` are passed.
    """
    html_path = example_doc_path("example-10k.html")
    directory, name = os.path.split(html_path)

    elements = partition(
        filename=html_path,
        metadata_filename=html_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    assert len(elements) > 0
    first_metadata = elements[0].metadata
    assert first_metadata.filename == name
    assert first_metadata.file_directory == directory


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_file(pass_metadata_filename: bool, content_type: str | None):
    """An HTML file given as an open binary file partitions to at least one element.

    Each case varies whether `metadata_filename` and/or `content_type` are passed.
    """
    html_path = example_doc_path("fake-html.html")
    partition_kwargs = {
        "metadata_filename": html_path if pass_metadata_filename else None,
        "content_type": content_type,
        "strategy": PartitionStrategy.HI_RES,
    }

    with open(html_path, "rb") as html_file:
        elements = partition(file=html_file, **partition_kwargs)

    assert len(elements) > 0


def test_auto_partition_html_from_file_rb():
    """An HTML file opened in binary mode partitions to at least one element."""
    html_path = example_doc_path("fake-html.html")
    with open(html_path, "rb") as html_file:
        elements = partition(file=html_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0


def test_auto_partition_html_pre_from_file():
    """Partitioning `fake-html-pre.htm` gives narrative text with HTML metadata, no page breaks."""
    htm_path = example_doc_path("fake-html-pre.htm")

    elements = partition(htm_path)

    assert len(elements) > 0
    assert all(elem.category != "PageBreak" for elem in elements)
    first = elements[0]
    assert clean_extra_whitespace(first.text).startswith("[107th Congress Public Law 56]")
    assert isinstance(first, NarrativeText)
    assert first.metadata.filetype == "text/html"
    assert first.metadata.filename == "fake-html-pre.htm"


# ================================================================================================
# IMAGE
# ================================================================================================


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_image(pass_metadata_filename: bool, content_type: str | None):
    """A JPG given by path with the AUTO strategy yields the paper title with coordinates.

    Each case varies whether `metadata_filename` and/or `content_type` are passed.
    """
    image_path = example_doc_path("layout-parser-paper-fast.jpg")
    expected_title = (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )

    elements = partition(
        filename=image_path,
        metadata_filename=image_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.AUTO,
    )

    # should be same result as test_partition_image_default_strategy_hi_res() in test_image.py
    title_element = elements[2]
    assert title_element.text == expected_title
    assert title_element.metadata.coordinates is not None


@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_image_element_extraction(extract_image_block_to_payload: bool):
    """Image-block extraction options given to `partition()` take effect for a JPG input.

    The outcome is checked by the shared `assert_element_extraction()` helper, both when blocks
    are extracted to the payload and when they are not.
    """
    block_types = ["Image", "Table"]
    image_path = example_doc_path("embedded-images-tables.jpg")

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            filename=image_path,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=output_dir,
        )

        # -- checked inside the `with` block, while the temporary directory still exists --
        assert_element_extraction(
            elements, block_types, extract_image_block_to_payload, output_dir
        )


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpg(pass_metadata_filename: bool, content_type: str | None):
    """A JPG given by path with the AUTO strategy partitions to at least one element.

    Each case varies whether `metadata_filename` and/or `content_type` are passed.
    """
    jpg_path = example_doc_path("layout-parser-paper-fast.jpg")

    elements = partition(
        filename=jpg_path,
        metadata_filename=jpg_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.AUTO,
    )

    assert len(elements) > 0


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpg_from_file(pass_metadata_filename: bool, content_type: str | None):
    """A JPG given as an open binary file with the AUTO strategy yields at least one element.

    Each case varies whether `metadata_filename` and/or `content_type` are passed.
    """
    jpg_path = example_doc_path("layout-parser-paper-fast.jpg")
    partition_kwargs = {
        "metadata_filename": jpg_path if pass_metadata_filename else None,
        "content_type": content_type,
        "strategy": PartitionStrategy.AUTO,
    }

    with open(jpg_path, "rb") as jpg_file:
        elements = partition(file=jpg_file, **partition_kwargs)

    assert len(elements) > 0


def test_partition_image_with_bmp_with_auto(tmp_path: pathlib.Path):
    """A BMP image partitioned with the HI_RES strategy yields exactly one HTML table."""
    bmp_path = str(tmp_path / "example.bmp")
    # -- produce the BMP under test by re-saving an example JPG under a `.bmp` name --
    with Image.open(example_doc_path("layout-parser-paper-with-table.jpg")) as jpg_image:
        jpg_image.save(bmp_path)

    elements = partition(filename=bmp_path, strategy=PartitionStrategy.HI_RES)

    # -- keep only the non-empty `text_as_html` values --
    html_tables = list(filter(None, (e.metadata.text_as_html for e in elements)))
    assert len(html_tables) == 1
    table_html = html_tables[0]
    assert "<table><thead><tr>" in table_html
    assert "</thead><tbody><tr>" in table_html


# ================================================================================================
# JSON
# ================================================================================================


def test_auto_partitioned_json_output_maintains_consistency_with_fixture_elements():
    """Test auto-processing an unstructured json output file by filename.

    The JSON fixture is loaded as the expected result, then partitioned and serialized back to
    JSON. The two are compared with the "metadata" key removed from every element dict.
    """
    json_file_path = example_doc_path("spring-weather.html.json")
    original_file_name = "spring-weather.html"
    # -- read as UTF-8 explicitly so decoding does not depend on the platform's locale encoding --
    with open(json_file_path, encoding="utf-8") as json_f:
        expected_result = json.load(json_f)

    elements = partition(
        filename=str(json_file_path),
        # -- use the original file name to get the same element IDs (hashes) --
        metadata_filename=original_file_name,
        strategy=PartitionStrategy.HI_RES,
    )
    partitioning_result = json.loads(cast(str, elements_to_json(elements)))

    # -- metadata is excluded from the comparison on both sides --
    for elem in partitioning_result:
        elem.pop("metadata")
    for elem in expected_result:
        elem.pop("metadata")
    assert expected_result == partitioning_result


def test_auto_partition_json_raises_with_unprocessable_json(tmp_path: pathlib.Path):
    """`partition()` raises `ValueError` for a JSON file that is not a list of dicts."""
    # NOTE(robinson) - This is unprocessable because it is not a list of dicts,
    # per the Unstructured ISD format
    json_path = tmp_path / "unprocessable.json"
    json_path.write_text('{"hi": "there"}')

    with pytest.raises(ValueError):
        partition(filename=str(json_path))


@pytest.mark.xfail(
    reason=(
        "https://github.com/Unstructured-IO/unstructured/issues/3365"
        " partition_json() does not preserve original element-id or metadata"
    ),
    raises=AssertionError,
    strict=True,
)
def test_auto_partition_json_from_file_preserves_original_elements():
    """Partitioning a JSON file should give back the same elements the file describes.

    Marked as a strict expected failure; see the issue linked in the `xfail` reason.
    """
    json_path = example_doc_path("simple.json")
    expected_dicts = elements_to_dicts(elements_from_json(json_path))

    with open(json_path, "rb") as json_file:
        partitioned_elements = partition(file=json_file)

    assert elements_to_dicts(partitioned_elements) == expected_dicts


def test_auto_partition_works_with_unstructured_jsons():
    """An unstructured JSON output file given by path partitions to its first element's text."""
    json_path = example_doc_path("spring-weather.html.json")

    first_element = partition(json_path, strategy=PartitionStrategy.HI_RES)[0]

    assert first_element.text == "News Around NOAA"


def test_auto_partition_works_with_unstructured_jsons_from_file():
    """An unstructured JSON output file given as an open binary file partitions correctly."""
    json_path = example_doc_path("spring-weather.html.json")
    with open(json_path, "rb") as json_file:
        elements = partition(file=json_file, strategy=PartitionStrategy.HI_RES)

    first_element = elements[0]
    assert first_element.text == "News Around NOAA"


# ================================================================================================
# MD
# ================================================================================================


def test_partition_md_works_with_embedded_html():
    """Markdown passed to `partition()` by URL with a markdown content-type is partitioned."""
    readme_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"

    elements = partition(
        url=readme_url,
        content_type="text/markdown",
        strategy=PartitionStrategy.HI_RES,
    )

    first_text = elements[0].text
    assert "unstructured" in first_text


# ================================================================================================
# MSG
# ================================================================================================


# -- elements the MSG test below expects from partitioning `fake-email.msg` --
EXPECTED_MSG_OUTPUT = [
    NarrativeText("This is a test email to use for unit tests."),
    Title("Important points:"),
    ListItem("Roses are red"),
    ListItem("Violets are blue"),
]


def test_auto_partition_msg_from_filename():
    """An MSG file given by path partitions to the expected email elements."""
    msg_path = example_doc_path("fake-email.msg")

    elements = partition(msg_path, strategy=PartitionStrategy.HI_RES)

    assert elements == EXPECTED_MSG_OUTPUT


# ================================================================================================
# ODT
# ================================================================================================


def test_auto_partition_odt_from_filename():
    """An ODT file given by path partitions with the expected title as its first element."""
    odt_path = example_doc_path("fake.odt")

    first_element = partition(odt_path, strategy=PartitionStrategy.HI_RES)[0]

    assert first_element == Title("Lorem ipsum dolor sit amet.")


def test_auto_partition_odt_from_file():
    """An ODT file given as an open binary file partitions with the expected first title."""
    odt_path = example_doc_path("fake.odt")
    with open(odt_path, "rb") as odt_file:
        elements = partition(file=odt_file, strategy=PartitionStrategy.HI_RES)

    first_element = elements[0]
    assert first_element == Title("Lorem ipsum dolor sit amet.")


# ================================================================================================
# ORG
# ================================================================================================


def test_auto_partition_org_from_filename():
    """An ORG file given by path partitions to the expected title with `text/org` filetype."""
    org_path = example_doc_path("README.org")

    first_element = partition(org_path)[0]

    assert first_element == Title("Example Docs")
    assert first_element.metadata.filetype == "text/org"


def test_auto_partition_org_from_file():
    """An ORG file given as an open binary file, with its content-type, partitions correctly."""
    org_path = example_doc_path("README.org")
    with open(org_path, "rb") as org_file:
        elements = partition(file=org_file, content_type="text/org")

    first_element = elements[0]
    assert first_element == Title("Example Docs")
    assert first_element.metadata.filetype == "text/org"


# ================================================================================================
# PDF
# ================================================================================================


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_type: str | None):
    """A PDF given by path partitions to the expected title and author elements.

    Each case varies whether `metadata_filename` and/or `content_type` are passed.
    """
    pdf_path = example_doc_path("layout-parser-paper-fast.pdf")
    directory, name = os.path.split(pdf_path)

    elements = partition(
        filename=pdf_path,
        metadata_filename=pdf_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    # NOTE(scanny): gave up trying to figure out why, but this file partitions differently locally
    # (on Mac) than it does in CI. Basically the first element when partitioning locally is split
    # in two when partitioning on CI. Other than that split the text is exactly the same.
    title_idx = 2 if sys.platform == "darwin" else 3

    title = elements[title_idx]
    assert isinstance(title, Title)
    assert title.text.startswith("LayoutParser")
    assert title.metadata.filename == name
    assert title.metadata.file_directory == directory

    authors = elements[title_idx + 1]
    assert isinstance(authors, NarrativeText)
    assert authors.text.startswith("Zejiang Shen")


def test_auto_partition_pdf_uses_table_extraction():
    """`pdf_infer_table_structure=True` reaches the OCR step as `infer_table_structure`."""
    pdf_path = example_doc_path("layout-parser-paper-fast.pdf")
    ocr_target = "unstructured.partition.pdf_image.ocr.process_file_with_ocr"

    with patch(ocr_target) as process_file_with_ocr_:
        partition(
            pdf_path,
            pdf_infer_table_structure=True,
            strategy=PartitionStrategy.HI_RES,
        )
        # -- `call_args` unpacks to (positional-args, keyword-args) of the most recent call --
        _, call_kwargs = process_file_with_ocr_.call_args
        assert call_kwargs["infer_table_structure"]


def test_auto_partition_pdf_with_fast_strategy(monkeypatch: MonkeyPatch):
    """`partition()` calls the PDF partitioner once with the FAST strategy and these arguments."""
    pdf_path = example_doc_path("layout-parser-paper-fast.pdf")
    canned_elements = [NarrativeText("Hello there!")]
    expected_call_kwargs = {
        "filename": pdf_path,
        "file": None,
        "url": None,
        "strategy": PartitionStrategy.FAST,
        "languages": None,
        "metadata_filename": None,
        "include_page_breaks": False,
        "infer_table_structure": False,
        "extract_images_in_pdf": False,
        "extract_image_block_types": None,
        "extract_image_block_output_dir": None,
        "extract_image_block_to_payload": False,
        "hi_res_model_name": None,
        "date_from_file_object": False,
        "starting_page_number": 1,
    }

    with patch.object(auto, "partition_pdf", return_value=canned_elements) as partition_pdf_:
        # -- also replace the module's extras map so its "pdf" entry is the mock --
        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {"pdf": partition_pdf_})
        partition(filename=pdf_path, strategy=PartitionStrategy.FAST)

    partition_pdf_.assert_called_once_with(**expected_call_kwargs)


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type: str | None):
    """A PDF given as an open binary file partitions to the expected title and author elements.

    Each case varies whether `metadata_filename` and/or `content_type` are passed.
    """
    pdf_path = example_doc_path("layout-parser-paper-fast.pdf")
    partition_kwargs = {
        "metadata_filename": pdf_path if pass_metadata_filename else None,
        "content_type": content_type,
        "strategy": PartitionStrategy.HI_RES,
    }

    with open(pdf_path, "rb") as pdf_file:
        elements = partition(file=pdf_file, **partition_kwargs)

    # NOTE(scanny): see "with_filename" version of this test above for more on this oddness
    title_idx = 2 if sys.platform == "darwin" else 3

    title = elements[title_idx]
    assert isinstance(title, Title)
    assert title.text.startswith("LayoutParser")

    authors = elements[title_idx + 1]
    assert isinstance(authors, NarrativeText)
    assert authors.text.startswith("Zejiang Shen")


def test_partition_pdf_does_not_raise_warning():
    """Partitioning a PDF with the HI_RES strategy emits no warnings."""
    pdf_path = example_doc_path("layout-parser-paper-fast.pdf")

    # NOTE(robinson): This is the recommended way to check that no warning is emitted,
    # per the pytest docs.
    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
    #      #additional-use-cases-of-warnings-in-tests
    with warnings.catch_warnings():
        # -- turn any warning into an exception so it fails the test --
        warnings.simplefilter("error")
        partition(pdf_path, strategy=PartitionStrategy.HI_RES)


@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_pdf_element_extraction(extract_image_block_to_payload: bool):
    """Image-block extraction options given to `partition()` take effect for a PDF input.

    The outcome is checked by the shared `assert_element_extraction()` helper, both when blocks
    are extracted to the payload and when they are not.
    """
    block_types = ["Image", "Table"]
    pdf_path = example_doc_path("embedded-images-tables.pdf")

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            pdf_path,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=output_dir,
        )

        # -- checked inside the `with` block, while the temporary directory still exists --
        assert_element_extraction(
            elements, block_types, extract_image_block_to_payload, output_dir
        )


# ================================================================================================
# PPT
# ================================================================================================


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_ppt_from_filename():
    """A PPT file given by path partitions to the same elements expected for the PPTX file."""
    ppt_path = example_doc_path("fake-power-point.ppt")
    directory, name = os.path.split(ppt_path)

    elements = partition(ppt_path, strategy=PartitionStrategy.HI_RES)

    assert elements == EXPECTED_PPTX_OUTPUT
    first_metadata = elements[0].metadata
    assert first_metadata.filename == name
    assert first_metadata.file_directory == directory


# ================================================================================================
# PPTX
# ================================================================================================


# -- elements expected from `fake-power-point.pptx` and `fake-power-point.ppt` --
EXPECTED_PPTX_OUTPUT = [
    Title("Adding a Bullet Slide"),
    ListItem("Find the bullet slide layout"),
    ListItem("Use _TextFrame.text for first bullet"),
    ListItem("Use _TextFrame.add_paragraph() for subsequent bullets"),
    NarrativeText("Here is a lot of text!"),
    NarrativeText("Here is some text in a text box!"),
]


def test_auto_partition_pptx_from_filename():
    """A PPTX file given by path partitions to the expected elements and file metadata."""
    pptx_path = example_doc_path("fake-power-point.pptx")
    directory, name = os.path.split(pptx_path)

    elements = partition(pptx_path, strategy=PartitionStrategy.HI_RES)

    assert elements == EXPECTED_PPTX_OUTPUT
    first_metadata = elements[0].metadata
    assert first_metadata.filename == name
    assert first_metadata.file_directory == directory


@pytest.mark.parametrize("file_name", ["simple.pptx", "fake-power-point.ppt"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_pptx_and_its_brokers(
    request: FixtureRequest, file_name: str, strategy: str
):
    """The `strategy` arg value received by `partition()` is received by `partition_pptx()`.

    In the brokering-partitioner case (PPT) the value must first reach `partition_ppt()`, which
    must then forward it to `partition_pptx()`. This test confirms it arrives at the PPTX
    partitioner by replacing the partitioner's element generator with one that reports the
    strategy value it sees.

    Note this is 2 file-types X 4 strategies = 8 test-cases.
    """
    from unstructured.partition.pptx import _PptxPartitioner

    def report_strategy(self: _PptxPartitioner) -> Iterator[Element]:
        # -- emit a single element whose text records the strategy that reached the partitioner --
        yield Text(f"strategy=={self._opts.strategy}")

    iter_presentation_elements_ = method_mock(
        request,
        _PptxPartitioner,
        "_iter_presentation_elements",
        side_effect=report_strategy,
    )
    presentation_path = example_doc_path(file_name)

    # -- exactly one element is expected; unpacking fails otherwise --
    [element] = partition(presentation_path, strategy=strategy)

    iter_presentation_elements_.assert_called_once_with(ANY)
    assert element.text == f"strategy=={strategy}"


# ================================================================================================
# RST
# ================================================================================================


def test_auto_partition_rst_from_filename():
    """An RST file given by path partitions to the expected title with `text/x-rst` filetype."""
    rst_path = example_doc_path("README.rst")

    first_element = partition(rst_path)[0]

    assert first_element == Title("Example Docs")
    assert first_element.metadata.filetype == "text/x-rst"


def test_auto_partition_rst_from_file():
    """An RST file given as an open binary file, with its content-type, partitions correctly."""
    rst_path = example_doc_path("README.rst")
    with open(rst_path, "rb") as rst_file:
        elements = partition(file=rst_file, content_type="text/x-rst")

    first_element = elements[0]
    assert first_element == Title("Example Docs")
    assert first_element.metadata.filetype == "text/x-rst"


# ================================================================================================
# RTF
# ================================================================================================


def test_auto_partition_rtf_from_filename():
    """An RTF file given by path partitions with the expected title as its first element."""
    rtf_path = example_doc_path("fake-doc.rtf")

    first_element = partition(rtf_path, strategy=PartitionStrategy.HI_RES)[0]

    assert first_element == Title("My First Heading")


# ================================================================================================
# TSV
# ================================================================================================


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_tsv_from_filename():
    """A TSV given by path yields a first element with the expected text, HTML and filetype."""
    tsv_path = example_doc_path("stanley-cups.tsv")

    table = partition(tsv_path)[0]

    assert clean_extra_whitespace(table.text) == EXPECTED_TEXT
    assert table.metadata.text_as_html == EXPECTED_TABLE
    assert table.metadata.filetype == "text/tsv"


# ================================================================================================
# TXT
# ================================================================================================


# -- elements the TXT tests below expect from partitioning `fake-text.txt` --
EXPECTED_TEXT_OUTPUT = [
    NarrativeText("This is a test document to use for unit tests."),
    Address("Doylestown, PA 18901"),
    Title("Important points:"),
    ListItem("Hamburgers are delicious"),
    ListItem("Dogs are the best"),
    ListItem("I love fuzzy blankets"),
]


def test_auto_partition_text_from_filename():
    """A plain-text path yields the expected elements and records its name and directory."""
    file_path = example_doc_path("fake-text.txt")
    directory, basename = os.path.split(file_path)

    elements = partition(filename=file_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_TEXT_OUTPUT
    first_metadata = elements[0].metadata
    assert first_metadata.filename == basename
    assert first_metadata.file_directory == directory


def test_auto_partition_text_from_file():
    """An open plain-text file partitions into the expected elements."""
    text_path = example_doc_path("fake-text.txt")
    with open(text_path, "rb") as text_file:
        elements = partition(file=text_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_TEXT_OUTPUT


# ================================================================================================
# XLS
# ================================================================================================

# -- expected `len()` of the first element's raw text in the XLS test below --
EXPECTED_XLS_TEXT_LEN = 550

# -- expected first 45 characters of the first element's text after whitespace cleaning --
EXPECTED_XLS_INITIAL_45_CLEAN_TEXT = "MC What is 2+2? 4 correct 3 incorrect MA What"

# -- expected `text_as_html` of the first element; the literal is split in two only so the
# -- long "ESS" row stays within the line-length limit (adjacent literals are concatenated)
EXPECTED_XLS_TABLE = (
    """<table border="1" class="dataframe">
  <tbody>
    <tr>
      <td>MC</td>
      <td>What is 2+2?</td>
      <td>4</td>
      <td>correct</td>
      <td>3</td>
      <td>incorrect</td>
      <td></td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>MA</td>
      <td>What C datatypes are 8 bits? (assume i386)</td>
      <td>int</td>
      <td></td>
      <td>float</td>
      <td></td>
      <td>double</td>
      <td></td>
      <td>char</td>
    </tr>
    <tr>
      <td>TF</td>
      <td>Bagpipes are awesome.</td>
      <td>true</td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>ESS</td>
      <td>How have the original Henry Hornbostel buildings """
    """influenced campus architecture and design in the last 30 years?</td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>ORD</td>
      <td>Rank the following in their order of operation.</td>
      <td>Parentheses</td>
      <td>Exponents</td>
      <td>Division</td>
      <td>Addition</td>
      <td></td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>FIB</td>
      <td>The student activities fee is</td>
      <td>95</td>
      <td>dollars for students enrolled in</td>
      <td>19</td>
      <td>units or more,</td>
      <td></td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>MAT</td>
      <td>Match the lower-case greek letter with its capital form.</td>
      <td>λ</td>
      <td>Λ</td>
      <td>α</td>
      <td>γ</td>
      <td>Γ</td>
      <td>φ</td>
      <td>Φ</td>
    </tr>
  </tbody>
</table>"""
)


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_xls_from_filename():
    """A legacy `.xls` workbook partitions into 14 elements, two of which are tables."""
    xls_path = example_doc_path("tests-example.xls")
    elements = partition(xls_path, include_header=False, skip_infer_table_types=[])

    tables = [element for element in elements if isinstance(element, Table)]
    assert len(tables) == 2
    assert len(elements) == 14

    first_element = elements[0]
    assert clean_extra_whitespace(first_element.text)[:45] == EXPECTED_XLS_INITIAL_45_CLEAN_TEXT
    # NOTE(crag): the raw text length depends on whether the beautifulsoup4 package is
    # installed. When it is, some (but not all) additional whitespace is removed, so the
    # expected length is smaller than when beautifulsoup4 is *not* installed. E.g.
    # "\n\n\nMA\nWhat C datatypes are 8 bits" vs.
    # '\n  \n    \n      MA\n      What C datatypes are 8 bits?... "
    assert len(first_element.text) == EXPECTED_XLS_TEXT_LEN
    assert first_element.metadata.text_as_html == EXPECTED_XLS_TABLE


# ================================================================================================
# XLSX
# ================================================================================================


# -- value the XLSX tests below expect in `metadata.filetype` --
EXPECTED_XLSX_FILETYPE: str = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"


def test_auto_partition_xlsx_from_filename():
    """An `.xlsx` path yields two titles and two tables with the expected table metadata."""
    xlsx_path = example_doc_path("stanley-cups.xlsx")
    elements = partition(xlsx_path, include_header=False, skip_infer_table_types=[])

    table_count = len([e for e in elements if isinstance(e, Table)])
    title_count = len([e for e in elements if isinstance(e, Title)])
    assert table_count == 2
    assert title_count == 2
    assert len(elements) == 4

    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
    second_element = elements[1]
    assert clean_extra_whitespace(second_element.text) == EXPECTED_TEXT_XLSX
    assert second_element.metadata.text_as_html == EXPECTED_TABLE_XLSX
    assert second_element.metadata.page_number == 1
    assert second_element.metadata.filetype == EXPECTED_XLSX_FILETYPE


def test_auto_partition_xlsx_from_file():
    """An open `.xlsx` file yields two titles and two tables with the expected table metadata."""
    xlsx_path = example_doc_path("stanley-cups.xlsx")
    with open(xlsx_path, "rb") as xlsx_file:
        elements = partition(file=xlsx_file, include_header=False, skip_infer_table_types=[])

    table_count = len([e for e in elements if isinstance(e, Table)])
    title_count = len([e for e in elements if isinstance(e, Title)])
    assert table_count == 2
    assert title_count == 2
    assert len(elements) == 4

    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
    second_element = elements[1]
    assert clean_extra_whitespace(second_element.text) == EXPECTED_TEXT_XLSX
    assert second_element.metadata.text_as_html == EXPECTED_TABLE_XLSX
    assert second_element.metadata.page_number == 1
    assert second_element.metadata.filetype == EXPECTED_XLSX_FILETYPE


def test_auto_partition_respects_starting_page_number_argument_for_xlsx():
    """`starting_page_number=3` is reflected in the page number of the second element."""
    xlsx_path = example_doc_path("stanley-cups.xlsx")

    second_element = partition(xlsx_path, starting_page_number=3)[1]

    assert second_element.metadata.page_number == 3


# ================================================================================================
# XML
# ================================================================================================


def test_auto_partition_xml_from_filename():
    """With tags dropped, an XML path yields plain text and the base name as filename."""
    file_path = example_doc_path("factbook.xml")

    first_element = partition(file_path, xml_keep_tags=False, metadata_filename=file_path)[0]

    assert first_element.text == "United States"
    assert first_element.metadata.filename == "factbook.xml"


def test_auto_partition_xml_from_file():
    """With tags dropped, an open XML file yields plain text for its first element."""
    xml_path = example_doc_path("factbook.xml")
    with open(xml_path, "rb") as xml_file:
        elements = partition(file=xml_file, xml_keep_tags=False)

    first_element = elements[0]
    assert first_element.text == "United States"


def test_auto_partition_xml_from_filename_with_tags():
    """With `xml_keep_tags=True`, an XML path keeps its markup in the element text."""
    first_element = partition(example_doc_path("factbook.xml"), xml_keep_tags=True)[0]

    assert "<leader>Joe Biden</leader>" in first_element.text
    assert first_element.metadata.filename == "factbook.xml"


def test_auto_partition_xml_from_file_with_tags():
    """With `xml_keep_tags=True`, an open XML file keeps its markup in the element text."""
    xml_path = example_doc_path("factbook.xml")
    with open(xml_path, "rb") as xml_file:
        elements = partition(file=xml_file, xml_keep_tags=True)

    first_element = elements[0]
    assert "<leader>Joe Biden</leader>" in first_element.text


# ================================================================================================
# FILE_TYPE NOT RECOGNIZED OR NOT SUPPORTED
# ================================================================================================


def test_auto_partition_raises_with_bad_type(request: FixtureRequest):
    """`partition()` raises `ValueError` when file-type detection returns `None`."""
    # -- stub detection so it reports "no type" regardless of the (non-existent) file
    detect_filetype_ = function_mock(
        request, "unstructured.partition.auto.detect_filetype", return_value=None
    )
    expected_detect_kwargs = {
        "content_type": None,
        "encoding": None,
        "file": None,
        "file_filename": None,
        "filename": "made-up.fake",
    }

    with pytest.raises(ValueError):
        partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)

    detect_filetype_.assert_called_once_with(**expected_detect_kwargs)


# ================================================================================================
# LOAD FROM URL
# ================================================================================================


def test_auto_partition_from_url():
    """A document fetched by URL with a plain content-type records that URL in its metadata."""
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"

    first_element = partition(
        url=url, content_type="text/plain", strategy=PartitionStrategy.HI_RES
    )[0]

    assert first_element == Title("Apache License")
    assert first_element.metadata.url == url


def test_auto_partition_from_url_with_rfc9110_content_type():
    """A content-type carrying a `charset` parameter is accepted when partitioning by URL."""
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
    content_type = "text/plain; charset=utf-8"

    elements = partition(url=url, content_type=content_type, strategy=PartitionStrategy.HI_RES)

    first_element = elements[0]
    assert first_element == Title("Apache License")
    assert first_element.metadata.url == url


def test_auto_partition_from_url_without_providing_content_type():
    """Partitioning by URL works when the caller supplies no content-type."""
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"

    first_element = partition(url=url, strategy=PartitionStrategy.HI_RES)[0]

    assert first_element == Title("Apache License")
    assert first_element.metadata.url == url


def test_auto_partition_warns_if_header_set_and_not_url(caplog: LogCaptureFixture):
    """Passing `headers` while partitioning a local file logs a warning first."""
    email_path = example_doc_path("eml/fake-email.eml")
    request_headers = {"Accept": "application/pdf"}

    partition(email_path, headers=request_headers, strategy=PartitionStrategy.HI_RES)

    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"


def test_partition_timeout_gets_routed():
    """`request_timeout` given to `partition()` is forwarded to `file_and_type_from_url()`."""

    class CallException(Exception):
        """Raised by the stub so partitioning stops as soon as the URL fetch is attempted."""

    file_and_type_from_url_ = Mock(side_effect=CallException("Function called!"))

    with patch("unstructured.partition.auto.file_and_type_from_url", file_and_type_from_url_):
        with pytest.raises(CallException):
            auto.partition(url="fake_url", request_timeout=326)

    forwarded_kwargs = file_and_type_from_url_.call_args.kwargs
    assert "request_timeout" in forwarded_kwargs
    assert forwarded_kwargs["request_timeout"] == 326


# ================================================================================================
# OTHER ARGS
# ================================================================================================

# -- chunking_strategy ----------------------------------------------------


def test_add_chunking_strategy_on_partition_auto():
    """`chunking_strategy="by_title"` gives the same result as calling `chunk_by_title()`."""
    file_path = example_doc_path("example-10k-1p.html")

    unchunked = partition(file_path)
    chunked_by_partition = partition(file_path, chunking_strategy="by_title")
    chunked_separately = chunk_by_title(unchunked)

    assert chunked_by_partition != unchunked
    assert chunked_by_partition == chunked_separately


def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
    """Table chunks produced via `partition()` honor the requested `max_characters`."""
    file_path = example_doc_path("example-10k-1p.html")

    def chunked_tables(max_characters: int) -> list[Element]:
        """Partition with by-title chunking and keep only the table elements and chunks."""
        chunks = partition(
            file_path,
            chunking_strategy="by_title",
            max_characters=max_characters,
            combine_text_under_n_chars=5,
        )
        return [chunk for chunk in chunks if isinstance(chunk, (Table, TableChunk))]

    # -- chunk once with a 200-character window and once with a 5-character window
    tables_200_chars = chunked_tables(200)
    tables_5_chars = chunked_tables(5)

    # -- the unchunked tables are the baseline both chunked runs are compared against
    unchunked_tables = [e for e in partition(file_path) if isinstance(e, Table)]

    assert len(tables_5_chars) != len(unchunked_tables)
    assert len(tables_200_chars) != len(unchunked_tables)

    # trailing whitespace is stripped from the first chunk, leaving only a checkbox character
    first_tiny_chunk, second_tiny_chunk = tables_5_chars[0], tables_5_chars[1]
    assert len(first_tiny_chunk.text) == 1
    # but the second chunk is the full 5 characters
    assert len(second_tiny_chunk.text) == 5
    assert len(cast(str, first_tiny_chunk.metadata.text_as_html)) == 5

    # the first table element is under 200 chars so doesn't get chunked!
    assert unchunked_tables[0] == tables_200_chars[0]
    assert len(tables_200_chars[0].text) < 200
    second_wide_chunk = tables_200_chars[1]
    assert len(second_wide_chunk.text) == 198
    assert len(cast(str, second_wide_chunk.metadata.text_as_html)) == 200


def test_add_chunking_strategy_chars_on_partition_auto_adds_is_continuation():
    """Chunking through `partition()` marks continuation table chunks with `is_continuation`.

    The previous loop only incremented its counter inside an `if i > 0` branch, so the counter
    never left zero and the `is_continuation` assertion was never executed. This version
    checks two things that do not depend on knowing where one split table ends and the next
    begins:

    - a chunk flagged as a continuation is always immediately preceded by another
      `TableChunk` (it cannot follow an unsplit `Table` or open the list), and
    - at least one chunk is flagged, so the test cannot pass vacuously.
    """
    file_path = example_doc_path("example-10k-1p.html")

    table_elements = [e for e in partition(file_path) if isinstance(e, Table)]
    table_chunks = [
        e
        for e in partition(file_path, chunking_strategy="by_title")
        if isinstance(e, (Table, TableChunk))
    ]

    assert table_elements != table_chunks

    continuation_count = 0
    previous_was_chunk = False
    for chunk in table_chunks:
        # -- an unsplit table arrives as a plain `Table`; only split tables yield `TableChunk`
        is_chunk = isinstance(chunk, TableChunk)
        if is_chunk and chunk.metadata.is_continuation:
            # -- the flag is exactly `True`, never merely truthy
            assert chunk.metadata.is_continuation is True
            # -- a continuation must directly follow an earlier chunk of a split table
            assert previous_was_chunk
            continuation_count += 1
        previous_was_chunk = is_chunk

    # -- guard against a vacuous pass: the document must produce a continuation chunk
    assert continuation_count > 0


# -- detect_language_per_element ------------------------------------------


def test_partition_respects_detect_language_per_element_arg():
    """With `detect_language_per_element=True` each element carries its own languages."""
    doc_path = example_doc_path("language-docs/eng_spa_mult.txt")
    expected_languages = [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]

    elements = partition(doc_path, detect_language_per_element=True)

    assert [element.metadata.languages for element in elements] == expected_languages


# -- languages ------------------------------------------------------------


@pytest.mark.parametrize(
    "file_extension",
    [
        "doc",
        "docx",
        "eml",
        "epub",
        "html",
        "md",
        "odt",
        "org",
        "ppt",
        "pptx",
        "rst",
        "rtf",
        "txt",
        "xml",
    ],
)
def test_partition_respects_language_arg(file_extension: str):
    """An explicit `languages` argument is applied to every element, whatever the file type."""
    doc_path = example_doc_path(f"language-docs/eng_spa_mult.{file_extension}")

    elements = partition(doc_path, languages=["deu"])

    # -- compare whole lists so a failure shows which elements deviated
    languages_per_element = [element.metadata.languages for element in elements]
    assert languages_per_element == [["deu"]] * len(elements)


# -- include_page_breaks --------------------------------------------------


def test_auto_with_page_breaks():
    """`include_page_breaks=True` yields at least one element in the "PageBreak" category."""
    pdf_path = example_doc_path("layout-parser-paper-fast.pdf")

    elements = partition(pdf_path, include_page_breaks=True, strategy=PartitionStrategy.HI_RES)

    assert any(element.category == "PageBreak" for element in elements)


# -- metadata_filename ----------------------------------------------------


def test_auto_partition_metadata_filename():
    """`metadata_filename` sets the filename recorded for a file-like input."""
    file_path = example_doc_path("fake-text.txt")
    with open(file_path, "rb") as text_file:
        elements = partition(file=text_file, metadata_filename=file_path)

    # -- only the base name is recorded in `.filename`, not the full path
    assert elements[0].metadata.filename == os.path.basename(file_path)


def test_auto_partition_warns_about_file_filename_deprecation(caplog: LogCaptureFixture):
    """The legacy `file_filename` kwarg still sets the filename but logs a deprecation warning."""
    file_path = example_doc_path("fake-text.txt")
    with open(file_path, "rb") as text_file:
        elements = partition(file=text_file, file_filename=file_path)

    assert elements[0].metadata.filename == os.path.basename(file_path)
    logged_text = caplog.text
    assert "WARNING" in logged_text
    assert "The file_filename kwarg will be deprecated" in logged_text


def test_auto_partition_raises_with_file_and_metadata_filename():
    """Supplying both `file_filename` and `metadata_filename` raises `ValueError`."""
    file_path = example_doc_path("fake-text.txt")
    with open(file_path, "rb") as text_file:
        with pytest.raises(ValueError):
            partition(file=text_file, file_filename=file_path, metadata_filename=file_path)


# -- ocr_languages --------------------------------------------------------


def test_auto_partition_formats_languages_for_tesseract():
    """The `"zh"` language code reaches the OCR call as a `+`-joined `ocr_languages` string."""
    ocr_target = "unstructured.partition.pdf_image.ocr.process_file_with_ocr"

    with patch(ocr_target) as process_file_with_ocr_:
        partition(
            example_doc_path("chi_sim_image.jpeg"),
            strategy=PartitionStrategy.HI_RES,
            languages=["zh"],
        )

    # -- inspect the keyword arguments of the first recorded OCR call
    first_call_kwargs = process_file_with_ocr_.call_args_list[0].kwargs
    assert "ocr_languages" in first_call_kwargs
    assert first_call_kwargs["ocr_languages"] == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"


@pytest.mark.parametrize(("languages", "ocr_languages"), [(["auto"], ""), (["eng"], "")])
def test_auto_partition_ignores_empty_string_for_ocr_languages(
    languages: list[str], ocr_languages: str
):
    """An empty `ocr_languages` string still results in `["eng"]` on the first element."""
    text_path = example_doc_path("book-war-and-peace-1p.txt")

    first_element = partition(
        text_path,
        strategy=PartitionStrategy.OCR_ONLY,
        ocr_languages=ocr_languages,
        languages=languages,
    )[0]

    assert first_element.metadata.languages == ["eng"]


def test_auto_partition_warns_with_ocr_languages(caplog: LogCaptureFixture):
    """Using the `ocr_languages` kwarg logs its deprecation notice."""
    pdf_path = example_doc_path("chevron-page.pdf")

    partition(pdf_path, strategy=PartitionStrategy.HI_RES, ocr_languages="eng")

    assert "The ocr_languages kwarg will be deprecated" in caplog.text


# -- skip_infer_table_types -----------------------------------------------


@pytest.mark.parametrize(
    ("skip_infer_table_types", "filename", "has_text_as_html_field"),
    [
        (["xlsx"], "stanley-cups.xlsx", False),
        ([], "stanley-cups.xlsx", True),
        (["odt"], "fake.odt", False),
        ([], "fake.odt", True),
    ],
)
def test_auto_partition_respects_skip_infer_table_types(
    skip_infer_table_types: list[str], filename: str, has_text_as_html_field: bool
):
    """Tables carry `text_as_html` only when their file type is not in the skip list."""
    with open(example_doc_path(filename), "rb") as doc_file:
        elements = partition(file=doc_file, skip_infer_table_types=skip_infer_table_types)

    for table in (element for element in elements if isinstance(element, Table)):
        # -- a missing attribute and an explicit `None` both count as "no HTML"
        has_html = getattr(table.metadata, "text_as_html", None) is not None
        assert has_html == has_text_as_html_field


# ================================================================================================
# METADATA BEHAVIORS
# ================================================================================================

# -- .filetype ------------------------------------------------------------

# -- every `FileType` member except the three excluded from the parametrized test below --
supported_filetypes = [
    file_type
    for file_type in FileType
    if file_type not in (FileType.UNK, FileType.ZIP, FileType.XLS)
]

# -- file types whose partitioner module name differs from the lower-cased member name --
FILETYPE_TO_MODULE = {
    **dict.fromkeys((FileType.JPG, FileType.PNG, FileType.HEIC), "image"),
    FileType.TXT: "text",
    FileType.EML: "email",
}


@pytest.mark.parametrize(
    ("content_type", "routing_func", "expected"),
    [
        ("text/csv", "csv", "text/csv"),
        ("text/html", "html", "text/html"),
        ("jdsfjdfsjkds", "pdf", None),
    ],
)
def test_auto_adds_filetype_to_metadata(
    request: FixtureRequest,
    content_type: str,
    routing_func: str,
    expected: str | None,
    monkeypatch: MonkeyPatch,
):
    """`partition()` sets `metadata.filetype` on elements returned by a stubbed partitioner.

    The partitioner named by `routing_func` is replaced with a mock that returns two bare
    `Text` elements, so any `filetype` on the result was added by `partition()` itself.
    """
    partition_fn_ = function_mock(
        request,
        f"unstructured.partition.auto.partition_{routing_func}",
        return_value=[Text("text 1"), Text("text 2")],
    )
    monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {routing_func: partition_fn_})

    # -- resolve the document with `example_doc_path()` like the rest of this module, rather
    # -- than a cwd-relative "example-docs/..." string that only works from the repo root
    pdf_path = example_doc_path("layout-parser-paper-fast.pdf")
    elements = partition(pdf_path, content_type=content_type)

    assert len(elements) == 2
    assert all(el.metadata.filetype == expected for el in elements)


@pytest.mark.parametrize(
    ("content_type", "expected"),
    [
        ("application/pdf", FILETYPE_TO_MIMETYPE[FileType.PDF]),
        (None, FILETYPE_TO_MIMETYPE[FileType.PDF]),
    ],
)
def test_auto_filetype_overrides_file_specific(
    request: FixtureRequest, content_type: str | None, expected: str, monkeypatch: MonkeyPatch
):
    """The filetype chosen by `partition()` replaces one set by the file-specific partitioner.

    The stubbed `partition_pdf` returns elements whose metadata already carries the bogus
    filetype "imapdf"; the result is expected to carry the PDF MIME type instead.
    """
    pdf_metadata = ElementMetadata(filetype="imapdf")
    partition_pdf_ = function_mock(
        request,
        "unstructured.partition.auto.partition_pdf",
        return_value=[Text("text 1", metadata=pdf_metadata), Text("text 2", metadata=pdf_metadata)],
    )
    monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {"pdf": partition_pdf_})

    # -- resolve the document with `example_doc_path()` like the rest of this module, rather
    # -- than a cwd-relative "example-docs/..." string that only works from the repo root
    pdf_path = example_doc_path("layout-parser-paper-fast.pdf")
    elements = partition(pdf_path, content_type=content_type)

    assert len(elements) == 2
    assert all(el.metadata.filetype == expected for el in elements)


@pytest.mark.parametrize("filetype", supported_filetypes)
def test_file_specific_produces_correct_filetype(filetype: FileType):
    """Each file-specific partitioner reports the MIME type mapped to its file type.

    Only the first example document with a matching extension is partitioned; elements
    whose `filetype` is `None` are ignored by the check.
    """
    if filetype in auto.IMAGE_FILETYPES or filetype in (FileType.WAV, FileType.EMPTY):
        pytest.skip()

    extension = filetype.name.lower()
    module_name = FILETYPE_TO_MODULE.get(filetype, extension)
    partition_module = import_module(f"unstructured.partition.{module_name}")
    partition_fn = getattr(partition_module, "partition_" + module_name)

    suffix = f".{extension}"
    candidates = pathlib.Path(example_doc_path("")).iterdir()
    example_doc = next((p for p in candidates if p.is_file() and p.suffix == suffix), None)
    if example_doc is None:
        # -- no example document for this type, nothing to check
        return

    elements = partition_fn(str(example_doc))
    assert all(
        el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype]
        for el in elements
        if el.metadata.filetype is not None
    )


# -- .languages -----------------------------------------------------------


def test_auto_partition_element_metadata_user_provided_languages():
    """Languages supplied by the caller appear in the first element's metadata."""
    pdf_path = example_doc_path("chevron-page.pdf")

    first_element = partition(pdf_path, strategy=PartitionStrategy.OCR_ONLY, languages=["eng"])[0]

    assert first_element.metadata.languages == ["eng"]


def test_partition_languages_incorrectly_defaults_to_English(tmp_path: pathlib.Path):
    """Short German text is labeled English, a known limitation this test pins down."""
    # -- We don't totally rely on langdetect for short text, so a brief German sentence such
    # -- as this one ends up labeled as English.
    german_file = tmp_path / "short-german.txt"
    german_file.write_text("Ein kurzer Satz.")

    elements = partition(str(german_file))

    assert elements[0].metadata.languages == ["eng"]


def test_partition_languages_default_to_None():
    """The first element left with `languages=None` is one that has no text."""
    docx_path = example_doc_path("handbook-1p.docx")
    elements = partition(docx_path, detect_language_per_element=True)

    # PageBreak and other elements with no text will have `None` for `languages`
    without_languages = [el for el in elements if el.metadata.languages is None]
    assert without_languages[0].text == ""


def test_partition_default_does_not_overwrite_other_defaults():
    """`partition()` ["eng"] default does not overwrite ["auto"] default in other partitioners."""
    # -- the default for `languages` is ["auto"] in partition_text
    from unstructured.partition.text import partition_text

    # -- use a document that is primarily in a language other than English
    file_path = example_doc_path("language-docs/UDHR_first_article_all.txt")

    languages_from_text = partition_text(file_path)[0].metadata.languages
    assert languages_from_text != ["eng"]

    languages_from_auto = partition(file_path)[0].metadata.languages
    assert languages_from_auto != ["eng"]
    assert languages_from_auto == languages_from_text


# ================================================================================================
# MISCELLANEOUS BEHAVIORS
# ================================================================================================


def test_auto_partition_works_on_empty_filename():
    """Partitioning an empty file by path returns no elements."""
    elements = partition(example_doc_path("empty.txt"))
    assert elements == []


def test_auto_partition_works_on_empty_file():
    """Partitioning an empty open file returns no elements."""
    with open(example_doc_path("empty.txt"), "rb") as empty_file:
        elements = partition(file=empty_file)
        assert elements == []


def test_get_partition_with_extras_prompts_for_install_if_missing():
    """A partitioner missing from the extras map raises `ImportError` with an install hint."""
    empty_extras_map: dict[str, Callable[..., list[Element]]] = {}
    expected_hint = 'Install the pdf dependencies with pip install "unstructured[pdf]"'

    with pytest.raises(ImportError) as exception_info:
        _get_partition_with_extras("pdf", empty_extras_map)

    assert expected_hint in str(exception_info.value)
-												fix(auto): partition() passes strategy to PPTX,DOCX (#3273)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_pptx()` or `partition_docx()`.
											
										
										
											2024-06-21 17:16:39 -07:00
+								# pyright: reportPrivateUsage=false
-												rfctr: flatten test_unstructured/partition (#3073)

**Summary**
Some partitioner test modules are placed in directories by themselves or
with one other test module. This unnecessarily obscures where to find
the test module corresponding to a partitioner.

Move partitioner test modules to mirror the directory structure of
`unstructured/partition`.
											
										
										
											2024-05-22 17:51:08 -07:00
+								from __future__ import annotations
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								import json
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								import os
 								import pathlib
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								import sys
-												Refactor: rename image extraction kwargs (#2303)

Currently, we're using different kwarg names in partition() and
partition_pdf(), which has implications for the API since it goes
through partition().

### Summary
- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` to `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in
`partition.auto`
- add unit tests to test element extraction for `pdf/image` via
`partition.auto`
### Testing
CI should pass.
											
										
										
											2024-01-04 09:52:00 -08:00
+								import tempfile
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
+								import warnings
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								from importlib import import_module
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								from typing import Callable, Iterator, cast
-												Feat: return base64 encoded images for PDF's (#2310)

Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition

elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
											
										
										
											2023-12-26 21:39:01 -08:00
+								from unittest.mock import Mock, patch
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
 								import docx
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								import pytest
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								from docx.document import Document
-												enhancement: add support from bitmap images (#2414)

### Summary

Adds support for bitmap images (`.bmp`) in both file detection and
partitioning. Bitmap images will be processed with `partition_image`
just like JPGs and PNGs.

### Testing

```python
from unstructured.file_utils.filetype import detect_filetype
from unstructured.partition.auto import partition
from PIL import Image

filename = "example-docs/layout-parser-paper-with-table.jpg"
bmp_filename = "~/tmp/ayout-parser-paper-with-table.bmp"

img = Image.open(filename)
img.save(bmp_filename)

detect_filetype(filename=bmp_filename) # Should be FileType.BMP

elements = partition(filename=bmp_filename)
```
											
										
										
											2024-01-17 17:50:36 -05:00
+								from PIL import Image
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												Refactor: rename image extraction kwargs (#2303)

Currently, we're using different kwarg names in partition() and
partition_pdf(), which has implications for the API since it goes
through partition().

### Summary
- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` to `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in
`partition.auto`
- add unit tests to test element extraction for `pdf/image` via
`partition.auto`
### Testing
CI should pass.
											
										
										
											2024-01-04 09:52:00 -08:00
+								from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
-												fix: stop csv and tsv dropping the first line of the file (#1530)

The current code assumes the first line of csv and tsv files are a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.

Here is a snippet of code that demonstrates the current behavior and the
proposed fix

```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring

c1 = """
    Stanley Cups,,
    Team,Location,Stanley Cups
    Blues,STL,1
    Flyers,PHI,2
    Maple Leafs,TOR,13
    """

f = "./test.csv"
with open(f, 'w') as ff:
    ff.write(c1)
  
print("Suggested Improvement Keep First Line") 
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)

print("\n\nOriginal Looses First Line") 
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-17 00:59:35 +02:00
+								from test_unstructured.partition.test_constants import (
 								    EXPECTED_TABLE,
 								    EXPECTED_TABLE_XLSX,
 								    EXPECTED_TEXT,
 								    EXPECTED_TEXT_XLSX,
 								    EXPECTED_TITLE,
 								)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								from test_unstructured.unit_utils import (
 								    ANY,
 								    FixtureRequest,
 								    LogCaptureFixture,
 								    MonkeyPatch,
 								    example_doc_path,
 								    function_mock,
 								    method_mock,
 								)
-												chunk_by_title decorator (#1304)

### Summary

Partial solution to #1185.
Related to #1222.
Creates decorator from `chunk_by_title` cleaning brick.
Breaks a document into sections based on the presence of Title elements.
Also starts a new section under the following conditions:

- If metadata changes, indicating a change in section or page or a
switch to processing attachments. If `multipage_sections=True`, sections
can span pages. `multipage_sections` defaults to True.
- If the length of the section exceeds `new_after_n_chars` characters.
The default is 1500. The **chunking function does not split individual
elements**, so it's possible for a section to exceed that threshold if
an individual element is over `new_after_n_chars` characters, which
could occur with a long NarrativeText element.

Combines sections under these conditions
- Sections under `combine_under_n_chars` characters are combined. The
default is 500.

### Testing

from unstructured.partition.html import partition_html

url = "https://understandingwar.org/backgrounder/russian-offensive-campaign-assessment-august-27-2023-0"
chunks = partition_html(url=url, chunking_strategy="by_title")

for chunk in chunks:
    print(chunk)
    print("\n\n" + "-"*80)
    input()

											
										
										
											2023-09-11 16:00:14 -05:00
+								from unstructured.chunking.title import chunk_by_title
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
+								from unstructured.cleaners.core import clean_extra_whitespace
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								from unstructured.documents.elements import (
 								    Address,
-												fix(auto): partition() passes strategy to PPTX,DOCX (#3273)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_pptx()` or `partition_docx()`.
											
										
										
											2024-06-21 17:16:39 -07:00
+								    Element,
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								    ElementMetadata,
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    ListItem,
 								    NarrativeText,
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
+								    Table,
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
ln [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								    TableChunk,
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    Text,
 								    Title,
 								)
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, FileType
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								from unstructured.partition import auto
-												enhancement: tell users to install missing extras (#1167)

### Summary

Updates `partition` to let users know to install the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating `partition_pdf` (or whichever function that requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
											
										
										
											2023-08-21 23:00:21 -04:00
+								from unstructured.partition.auto import _get_partition_with_extras, partition
-												feat: add `partition_doc` for `.doc` files (#236)

* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
											
										
										
											2023-02-17 09:30:23 -05:00
+								from unstructured.partition.common import convert_office_doc
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								from unstructured.partition.utils.constants import PartitionStrategy
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								from unstructured.staging.base import elements_from_json, elements_to_dicts, elements_to_json
-												Bug/635 unicode decode error eml (#739)

* Adds functionality to extract charset info from eml files
* Adds missed file-like object handling in detect_file_encoding
* Adds functionality to replace the MIME encodings for eml files with one of the
   common encodings if a unicode error occurs
* Organize the eml example files in the example-docs/eml directory

											
										
										
											2023-06-16 17:52:13 -07:00
-												fix: correct order of kwargs in pandoc (#421)

* fix: correct order of kwargs in pandoc

* only skip epub tests in Docker

* changelog

---------

Co-authored-by: Crag Wolfe <crag@unstructuredai.io>
Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-30 16:54:29 -04:00
+								is_in_docker = os.path.exists("/.dockerenv")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# CSV
 								# ================================================================================================
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
 								def test_auto_partition_csv_from_filename():
 								    """`partition()` given a CSV file path yields the expected text, HTML table and filetype."""
 								    elements = partition(example_doc_path("stanley-cups.csv"))
 								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
 								    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
 								    assert elements[0].metadata.filetype == "text/csv"
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
 								def test_auto_partition_csv_from_file():
 								    """`partition()` given an open binary CSV file yields a `Table` with CSV metadata."""
 								    with open(example_doc_path("stanley-cups.csv"), "rb") as f:
 								        elements = partition(file=f)
 								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
 								    assert isinstance(elements[0], Table)
 								    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
 								    assert elements[0].metadata.filetype == "text/csv"
 								# ================================================================================================
 								# DOC
 								# ================================================================================================
 								@pytest.mark.parametrize(
 								    ("pass_metadata_filename", "content_type"),
 								    [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
 								)
 								def test_auto_partition_doc_with_filename(
 								    mock_docx_document: Document,
 								    expected_docx_elements: list[Element],
 								    tmp_path: pathlib.Path,
 								    pass_metadata_filename: bool,
 								    content_type: str | None,
 								):
 								    """`partition()` handles a `.doc` path with or without `metadata_filename`/`content_type`.

 								    Each parametrized case toggles whether `metadata_filename` and an explicit
 								    `application/msword` content-type are passed; the resulting elements and their
 								    filename/directory metadata must be the same in every case.
 								    """
 								    docx_file_path = str(tmp_path / "mock_document.docx")
 								    doc_file_path = str(tmp_path / "mock_document.doc")
 								    # -- save the fixture document as DOCX, then convert it to DOC format; the conversion is
 								    # -- expected to leave `mock_document.doc` in `tmp_path`, which is the file partitioned below
 								    mock_docx_document.save(docx_file_path)
 								    convert_office_doc(docx_file_path, str(tmp_path), "doc")
 								    # -- `metadata_filename` is only supplied in the cases that ask for it --
 								    metadata_filename = doc_file_path if pass_metadata_filename else None
 								    elements = partition(
 								        filename=doc_file_path,
 								        metadata_filename=metadata_filename,
 								        content_type=content_type,
 								        strategy=PartitionStrategy.HI_RES,
 								    )
 								    assert elements == expected_docx_elements
 								    assert elements[0].metadata.filename == "mock_document.doc"
 								    assert elements[0].metadata.file_directory == str(tmp_path)
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								@pytest.mark.skipif(is_in_docker, reason="Passes in CI but not Docker. Remove skip on #3364 fix.")
 								@pytest.mark.xfail(sys.platform == "darwin", reason="#3364", raises=KeyError, strict=True)
 								def test_auto_partition_doc_with_file():
 								    """`partition()` given an open binary `.doc` file yields the elements of `simple.doc`."""
 								    # -- NOTE(scanny): https://github.com/Unstructured-IO/unstructured/issues/3364
 								    # -- detect_filetype() identifies .doc as `application/x-ole-storage` which is true but not
 								    # -- specific enough. The `FileType.MSG` file-type is assigned (which is also an OLE file)
 								    # -- and `partition()` routes the document to `partition_msg` which is where the `KeyError`
 								    # -- comes from.
 								    # -- For some reason, this xfail problem only occurs locally, not in CI, possibly because we
 								    # -- use two different `libmagic` sources (`libmagic` on CI and `libmagic1` on Mac). Doesn't
 								    # -- matter much though because when we add disambiguation they'll both get it right.
 								    with open(example_doc_path("simple.doc"), "rb") as f:
 								        elements = partition(file=f)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    assert elements == [
 								        Title("These are a few of my favorite things:"),
 								        ListItem("Parrots"),
 								        ListItem("Hockey"),
 								        Title("Analysis"),
 								        NarrativeText("This is my first thought. This is my second thought."),
 								        NarrativeText("This is my third thought."),
 								        Text("2023"),
 								        Address("DOYLESTOWN, PA 18901"),
 								    ]
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								# ================================================================================================
 								# DOCX
 								# ================================================================================================
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								@pytest.fixture()
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								def mock_docx_document():
 								    """Build and return a small in-memory docx document for the DOCX partition tests.
 								
 								    The document is created with `docx.Document()` and filled with one paragraph per
 								    case the tests care about: a "Heading 1" paragraph, a bulleted "Normal" paragraph,
 								    a "List Bullet" paragraph, a short "Normal" paragraph, an empty paragraph, two
 								    sentence-style paragraphs ("Normal" and "Body Text") and a paragraph added without
 								    an explicit style. The tests receive it through their `mock_docx_document`
 								    parameter and save it to disk before partitioning.
 								    """
 								    document = docx.Document()
 								    document.add_paragraph("These are a few of my favorite things:", style="Heading 1")
 								    # NOTE(robinson) - this should get picked up as a list item due to the •
 								    document.add_paragraph("• Parrots", style="Normal")
 								    document.add_paragraph("Hockey", style="List Bullet")
 								    # NOTE(robinson) - this should get picked up as a title
 								    document.add_paragraph("Analysis", style="Normal")
 								    # NOTE(robinson) - this should get dropped because it is empty
 								    document.add_paragraph("", style="Normal")
 								    # NOTE(robinson) - this should get picked up as a narrative text
 								    document.add_paragraph("This is my first thought. This is my second thought.", style="Normal")
 								    document.add_paragraph("This is my third thought.", style="Body Text")
 								    # NOTE(robinson) - this should just be regular text
 								    document.add_paragraph("2023")
 								    return document
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								@pytest.fixture()
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								def expected_docx_elements():
 								    """Return the elements the DOCX tests compare against the `partition()` output.
 								
 								    The list holds one element per non-empty paragraph of `mock_docx_document`, in
 								    document order; the empty paragraph has no counterpart here.
 								    """
 								    return [
 								        Title("These are a few of my favorite things:"),
 								        ListItem("Parrots"),
 								        ListItem("Hockey"),
 								        Title("Analysis"),
 								        NarrativeText("This is my first thought. This is my second thought."),
 								        NarrativeText("This is my third thought."),
 								        Text("2023"),
 								    ]
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_docx_with_filename(
 								    mock_docx_document: Document, expected_docx_elements: list[Element], tmp_path: pathlib.Path
 								):
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = str(tmp_path / "mock_document.docx")
 								    mock_docx_document.save(file_path)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(filename=file_path, strategy=PartitionStrategy.HI_RES)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert elements == expected_docx_elements
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    assert elements[0].metadata.filename == os.path.basename(file_path)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_docx_with_file(
 								    mock_docx_document: Document, expected_docx_elements: list[Element], tmp_path: pathlib.Path
 								):
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = str(tmp_path / "mock_document.docx")
 								    mock_docx_document.save(file_path)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    with open(file_path, "rb") as f:
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert elements == expected_docx_elements
-												fix(auto): partition() passes strategy to DOC,ODT (#3278)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_doc()` or `partition_odt()` and so was not
making its way to `partition_docx()`.
											
										
										
											2024-06-25 17:29:47 -07:00
+								@pytest.mark.parametrize("file_name", ["simple.docx", "simple.doc", "simple.odt"])
-												fix(auto): partition() passes strategy to PPTX,DOCX (#3273)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_pptx()` or `partition_docx()`.
											
										
										
											2024-06-21 17:16:39 -07:00
+								@pytest.mark.parametrize(
 								    "strategy",
 								    [
 								        PartitionStrategy.AUTO,
 								        PartitionStrategy.FAST,
 								        PartitionStrategy.HI_RES,
 								        PartitionStrategy.OCR_ONLY,
 								    ],
 								)
-												fix(auto): partition() passes strategy to DOC,ODT (#3278)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_doc()` or `partition_odt()` and so was not
making its way to `partition_docx()`.
											
										
										
											2024-06-25 17:29:47 -07:00
+								def test_partition_forwards_strategy_arg_to_partition_docx_and_its_brokers(
 								    request: FixtureRequest, file_name: str, strategy: str
 								):
 								    """The `strategy` arg value received by `partition()` is received by `partition_docx().
 								    To do this in the brokering-partitioner cases (DOC, ODT) it must make its way to
 								    `partition_doc()` or `partition_odt()` which must then forward it to `partition_docx()`. This
 								    test makes sure it made it all the way.
 								    Note this is 3 file-types X 4 strategies = 12 test-cases.
 								    """
-												fix(auto): partition() passes strategy to PPTX,DOCX (#3273)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_pptx()` or `partition_docx()`.
											
										
										
											2024-06-21 17:16:39 -07:00
+								    from unstructured.partition.docx import _DocxPartitioner
 								    def fake_iter_document_elements(self: _DocxPartitioner) -> Iterator[Element]:
 								        yield Text(f"strategy=={self._opts.strategy}")
 								    _iter_elements_ = method_mock(
 								        request,
 								        _DocxPartitioner,
 								        "_iter_document_elements",
 								        side_effect=fake_iter_document_elements,
 								    )
-												fix(auto): partition() passes strategy to DOC,ODT (#3278)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_doc()` or `partition_odt()` and so was not
making its way to `partition_docx()`.
											
										
										
											2024-06-25 17:29:47 -07:00
+								    (element,) = partition(example_doc_path(file_name), strategy=strategy)
-												fix(auto): partition() passes strategy to PPTX,DOCX (#3273)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_pptx()` or `partition_docx()`.
											
										
										
											2024-06-21 17:16:39 -07:00
 								    _iter_elements_.assert_called_once_with(ANY)
 								    assert element.text == f"strategy=={strategy}"
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# EML
 								# ================================================================================================
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								EXPECTED_EMAIL_OUTPUT = [
 								    NarrativeText(text="This is a test email to use for unit tests."),
 								    Title(text="Important points:"),
 								    ListItem(text="Roses are red"),
 								    ListItem(text="Violets are blue"),
 								]
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								def test_auto_partition_email_from_filename():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("eml/fake-email.eml")
 								    elements = partition(file_path, strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_EMAIL_OUTPUT
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    assert elements[0].metadata.filename == os.path.basename(file_path)
 								    assert elements[0].metadata.file_directory == os.path.split(file_path)[0]
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								def test_auto_partition_email_from_file():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    with open(example_doc_path("eml/fake-email.eml"), "rb") as f:
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
 								    assert len(elements) > 0
 								    assert elements == EXPECTED_EMAIL_OUTPUT
 								def test_auto_partition_eml_add_signature_to_metadata():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(example_doc_path("eml/signed-doc.p7s"))
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert len(elements) == 1
 								    assert elements[0].text == "This is a test"
 								    assert elements[0].metadata.signature == "<SIGNATURE>\n"
 								# ================================================================================================
 								# EPUB
 								# ================================================================================================
 								def test_auto_partition_epub_from_filename():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(example_doc_path("winter-sports.epub"), strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert len(elements) > 0
 								    assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
 								def test_auto_partition_epub_from_file():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    with open(example_doc_path("winter-sports.epub"), "rb") as f:
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
 								    assert len(elements) > 0
 								    assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
 								# ================================================================================================
 								# HTML
 								# ================================================================================================
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
 								)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_html_from_filename(pass_metadata_filename: bool, content_type: str | None):
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("example-10k.html")
 								    metadata_filename = file_path if pass_metadata_filename else None
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        filename=file_path,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								        metadata_filename=metadata_filename,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        content_type=content_type,
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        strategy=PartitionStrategy.HI_RES,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    )
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    assert elements[0].metadata.filename == os.path.basename(file_path)
 								    assert elements[0].metadata.file_directory == os.path.split(file_path)[0]
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
 								)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_html_from_file(pass_metadata_filename: bool, content_type: str | None):
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("fake-html.html")
 								    metadata_filename = file_path if pass_metadata_filename else None
 								    with open(file_path, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(
 								            file=f,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								            metadata_filename=metadata_filename,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								            content_type=content_type,
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								            strategy=PartitionStrategy.HI_RES,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        )
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
 								def test_auto_partition_html_from_file_rb():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    with open(example_doc_path("fake-html.html"), "rb") as f:
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_html_pre_from_file():
 								    elements = partition(example_doc_path("fake-html-pre.htm"))
 								    assert len(elements) > 0
 								    assert "PageBreak" not in [elem.category for elem in elements]
 								    assert clean_extra_whitespace(elements[0].text).startswith("[107th Congress Public Law 56]")
 								    assert isinstance(elements[0], NarrativeText)
 								    assert elements[0].metadata.filetype == "text/html"
 								    assert elements[0].metadata.filename == "fake-html-pre.htm"
 								# ================================================================================================
 								# IMAGE
 								# ================================================================================================
 								@pytest.mark.parametrize(
 								    ("pass_metadata_filename", "content_type"),
 								    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
 								)
 								def test_auto_partition_image(pass_metadata_filename: bool, content_type: str | None):
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("layout-parser-paper-fast.jpg")
 								    metadata_filename = file_path if pass_metadata_filename else None
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    elements = partition(
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        filename=file_path,
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        metadata_filename=metadata_filename,
 								        content_type=content_type,
 								        strategy=PartitionStrategy.AUTO,
 								    )
 								    # should be same result as test_partition_image_default_strategy_hi_res() in test_image.py
 								    title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
 								    idx = 2
 								    assert elements[idx].text == title
 								    assert elements[idx].metadata.coordinates is not None
 								@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
 								def test_auto_partition_image_element_extraction(extract_image_block_to_payload: bool):
 								    """Partition a JPG requesting Image and Table block extraction, then verify the result."""
 								    block_types = ["Image", "Table"]
 								    image_path = example_doc_path("embedded-images-tables.jpg")
 								    # -- extraction output goes to a throwaway directory that is removed on exit --
 								    with tempfile.TemporaryDirectory() as output_dir:
 								        partitioned = partition(
 								            filename=image_path,
 								            extract_image_block_types=block_types,
 								            extract_image_block_to_payload=extract_image_block_to_payload,
 								            extract_image_block_output_dir=output_dir,
 								        )
 								        # -- verification is delegated to the helper shared with the PDF tests --
 								        assert_element_extraction(
 								            partitioned, block_types, extract_image_block_to_payload, output_dir
 								        )
 								@pytest.mark.parametrize(
 								    ("pass_metadata_filename", "content_type"),
 								    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
 								)
 								def test_auto_partition_jpg(pass_metadata_filename: bool, content_type: str | None):
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("layout-parser-paper-fast.jpg")
 								    metadata_filename = file_path if pass_metadata_filename else None
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    elements = partition(
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        filename=file_path,
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        metadata_filename=metadata_filename,
 								        content_type=content_type,
 								        strategy=PartitionStrategy.AUTO,
 								    )
 								    assert len(elements) > 0
 								@pytest.mark.parametrize(
 								    ("pass_metadata_filename", "content_type"),
 								    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
 								)
 								def test_auto_partition_jpg_from_file(pass_metadata_filename: bool, content_type: str | None):
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("layout-parser-paper-fast.jpg")
 								    metadata_filename = file_path if pass_metadata_filename else None
 								    with open(file_path, "rb") as f:
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        elements = partition(
 								            file=f,
 								            metadata_filename=metadata_filename,
 								            content_type=content_type,
 								            strategy=PartitionStrategy.AUTO,
 								        )
 								    assert len(elements) > 0
 								def test_partition_image_with_bmp_with_auto(tmp_path: pathlib.Path):
 								    """A BMP copy of the table example image partitions to exactly one HTML table."""
 								    bmp_path = tmp_path / "example.bmp"
 								    # -- produce the BMP input by re-saving the JPG example under a .bmp name --
 								    with Image.open(example_doc_path("layout-parser-paper-with-table.jpg")) as source_image:
 								        source_image.save(str(bmp_path))
 								    elements = partition(filename=str(bmp_path), strategy=PartitionStrategy.HI_RES)
 								    # -- keep only the non-empty `text_as_html` values --
 								    html_tables = [html for html in (e.metadata.text_as_html for e in elements) if html]
 								    assert len(html_tables) == 1
 								    table_html = html_tables[0]
 								    assert "<table><thead><tr>" in table_html
 								    assert "</thead><tbody><tr>" in table_html
 								# ================================================================================================
 								# JSON
 								# ================================================================================================
-												Better element IDs - deterministic and document-unique hashes (#2673)

Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes element's sequence number on page, page
number, document filename and its text
* there are more tests for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

											
										
										
											2024-04-24 09:05:20 +02:00
+								def test_auto_partitioned_json_output_maintains_consistency_with_fixture_elements():
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json" on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492.
											
										
										
											2023-04-17 23:11:21 -07:00
+								    """Test auto-processing an unstructured json output file by filename."""
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    json_file_path = example_doc_path("spring-weather.html.json")
-												Better element IDs - deterministic and document-unique hashes (#2673)

Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes element's sequence number on page, page
number, document filename and its text
* there are more tests for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

											
										
										
											2024-04-24 09:05:20 +02:00
+								    original_file_name = "spring-weather.html"
 								    with open(json_file_path) as json_f:
 								        expected_result = json.load(json_f)
 								    partitioning_result = json.loads(
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        cast(
 								            str,
 								            elements_to_json(
 								                partition(
 								                    filename=str(json_file_path),
 								                    # -- use the original file name to get the same element IDs (hashes) --
 								                    metadata_filename=original_file_name,
 								                    strategy=PartitionStrategy.HI_RES,
 								                )
 								            ),
-												Better element IDs - deterministic and document-unique hashes (#2673)

Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes element's sequence number on page, page
number, document filename and its text
* there are more tests for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

											
										
										
											2024-04-24 09:05:20 +02:00
+								        )
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								    )
-												Better element IDs - deterministic and document-unique hashes (#2673)

Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes element's sequence number on page, page
number, document filename and its text
* there are more tests for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

											
										
										
											2024-04-24 09:05:20 +02:00
+								    for elem in partitioning_result:
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference.
											
										
										
											2023-05-15 13:23:19 -05:00
+								        elem.pop("metadata")
-												Better element IDs - deterministic and document-unique hashes (#2673)

Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes element's sequence number on page, page
number, document filename and its text
* there are more tests for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

											
										
										
											2024-04-24 09:05:20 +02:00
+								    for elem in expected_result:
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference.
											
										
										
											2023-05-15 13:23:19 -05:00
+								        elem.pop("metadata")
-												Better element IDs - deterministic and document-unique hashes (#2673)

Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes element's sequence number on page, page
number, document filename and its text
* there are more tests for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

											
										
										
											2024-04-24 09:05:20 +02:00
+								    assert expected_result == partitioning_result
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json" on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492.
											
										
										
											2023-04-17 23:11:21 -07:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_json_raises_with_unprocessable_json(tmp_path: pathlib.Path):
-												refactor: simplifies JSON detection and add tests (#975)

* refactor json detection

* version and changelog

* fix mock in test
											
										
										
											2023-07-25 15:59:45 -04:00
+								    # NOTE(robinson) - This is unprocessable because it is not a list of dicts,
 								    # per the Unstructured ISD format
 								    text = '{"hi": "there"}'
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = str(tmp_path / "unprocessable.json")
 								    with open(file_path, "w") as f:
-												refactor: simplifies JSON detection and add tests (#975)

* refactor json detection

* version and changelog

* fix mock in test
											
										
										
											2023-07-25 15:59:45 -04:00
+								        f.write(text)
 								    with pytest.raises(ValueError):
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        partition(filename=file_path)
-												refactor: simplifies JSON detection and add tests (#975)

* refactor json detection

* version and changelog

* fix mock in test
											
										
										
											2023-07-25 15:59:45 -04:00
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json" on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492.
											
										
										
											2023-04-17 23:11:21 -07:00
+								@pytest.mark.xfail(
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    reason=(
 								        "https://github.com/Unstructured-IO/unstructured/issues/3365"
 								        " partition_json() does not preserve original element-id or metadata"
 								    ),
 								    raises=AssertionError,
 								    strict=True,
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json" on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492.
											
										
										
											2023-04-17 23:11:21 -07:00
+								)
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								def test_auto_partition_json_from_file_preserves_original_elements():
 								    file_path = example_doc_path("simple.json")
 								    original_elements = elements_from_json(file_path)
 								    with open(file_path, "rb") as f:
 								        partitioned_elements = partition(file=f)
 								    assert elements_to_dicts(partitioned_elements) == elements_to_dicts(original_elements)
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json" on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492.
											
										
										
											2023-04-17 23:11:21 -07:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_works_with_unstructured_jsons():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(
 								        example_doc_path("spring-weather.html.json"), strategy=PartitionStrategy.HI_RES
 								    )
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements[0].text == "News Around NOAA"
 								def test_auto_partition_works_with_unstructured_jsons_from_file():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    with open(example_doc_path("spring-weather.html.json"), "rb") as f:
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
 								    assert elements[0].text == "News Around NOAA"
 								# ================================================================================================
 								# MD
 								# ================================================================================================
 								def test_partition_md_works_with_embedded_html():
 								    """Partition the project README fetched by URL as Markdown and check the first element."""
 								    readme_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"
 								    elements = partition(
 								        url=readme_url,
 								        content_type="text/markdown",
 								        strategy=PartitionStrategy.HI_RES,
 								    )
 								    first_text = elements[0].text
 								    assert "unstructured" in first_text
 								# ================================================================================================
 								# MSG
 								# ================================================================================================
 								EXPECTED_MSG_OUTPUT = [
 								    NarrativeText(text="This is a test email to use for unit tests."),
 								    Title(text="Important points:"),
 								    ListItem(text="Roses are red"),
 								    ListItem(text="Violets are blue"),
-												feat: add support for `.txt` files in `partition` (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
											
										
										
											2023-01-13 16:39:53 -05:00
+								]
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_msg_from_filename():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(example_doc_path("fake-email.msg"), strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements == EXPECTED_MSG_OUTPUT
-												feat: add support for `.txt` files in `partition` (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
											
										
										
											2023-01-13 16:39:53 -05:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# ODT
 								# ================================================================================================
 								def test_auto_partition_odt_from_filename():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(example_doc_path("fake.odt"), strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements[0] == Title("Lorem ipsum dolor sit amet.")
 								def test_auto_partition_odt_from_file():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    with open(example_doc_path("fake.odt"), "rb") as f:
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								    assert elements[0] == Title("Lorem ipsum dolor sit amet.")
 								# ================================================================================================
 								# ORG
 								# ================================================================================================
 								def test_auto_partition_org_from_filename():
 								    """Partitioning `README.org` by path yields the expected title tagged as `text/org`."""
 								    org_path = example_doc_path("README.org")

 								    first = partition(org_path)[0]

 								    assert first == Title("Example Docs")
 								    assert first.metadata.filetype == "text/org"
 								def test_auto_partition_org_from_file():
 								    """Partitioning `README.org` from an open binary file with an explicit content-type.

 								    The file object is passed together with `content_type="text/org"`; the first element
 								    must be the expected title and carry the `text/org` filetype in its metadata.
 								    """
 								    org_path = example_doc_path("README.org")

 								    with open(org_path, "rb") as org_file:
 								        partitioned = partition(file=org_file, content_type="text/org")

 								    first = partitioned[0]
 								    assert first == Title("Example Docs")
 								    assert first.metadata.filetype == "text/org"
 								# ================================================================================================
 								# PDF
 								# ================================================================================================
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
 								)
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_type: str | None):
 								    file_path = example_doc_path("layout-parser-paper-fast.pdf")
 								    metadata_filename = file_path if pass_metadata_filename else None
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        filename=file_path,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								        metadata_filename=metadata_filename,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        content_type=content_type,
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        strategy=PartitionStrategy.HI_RES,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    )
-												chore: return `Element` objects in `partition_pdf` and `partition_image` (#164)

* helper function to convert to element

* test for element types

* fix for healthcheck url

* version bump

* note on coordinates

* mention FigureCaption

* test_shared -> test_common

* add check boxes for checkbox template

* update changelog
											
										
										
											2023-01-19 09:29:28 -05:00
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    # NOTE(scanny): gave up trying to figure out why, but this file partitions differently locally
 								    # (on Mac) than it does in CI. Basically the first element when partitioning locally is split
 								    # in two when partitioning on CI. Other than that split the text is exactly the same.
 								    idx = 2 if sys.platform == "darwin" else 3
-												chore: return `Element` objects in `partition_pdf` and `partition_image` (#164)

* helper function to convert to element

* test for element types

* fix for healthcheck url

* version bump

* note on coordinates

* mention FigureCaption

* test_shared -> test_common

* add check boxes for checkbox template

* update changelog
											
										
										
											2023-01-19 09:29:28 -05:00
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    e = elements[idx]
 								    assert isinstance(e, Title)
 								    assert e.text.startswith("LayoutParser")
 								    assert e.metadata.filename == os.path.basename(file_path)
 								    assert e.metadata.file_directory == os.path.split(file_path)[0]
-												feat: add metadata tracking to document elements (#225)

* add metadata field to elements

* metadata tracking for pdf/image

* metadata for html

* update expected outputs

* metadata for the rest of the document types

* take out file metadata for now

* add url to tables

* added metadata to test_auto

* bump version

* added coordinates to __init__

* fix coordinates in tests
											
										
										
											2023-02-15 13:26:20 -05:00
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    e = elements[idx + 1]
 								    assert isinstance(e, NarrativeText)
 								    assert e.text.startswith("Zejiang Shen")
-												build(deps): update inference version (#662)

Updated to the latest version of unstructured-inference. detectron2 now gets implemented with onnxruntime, yay!

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
											
										
										
											2023-05-31 13:50:15 -05:00
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												feat: extract tables (#503)

Exposes table extraction through partition and partition_pdf.
											
										
										
											2023-04-21 12:01:29 -05:00
+								def test_auto_partition_pdf_uses_table_extraction():
 								    with patch(
-												Refactor: support merging `extracted` layout with `inferred` layout (#2158)

### Summary
This PR is the second part of `pdfminer` refactor to move it from
`unstructured-inference` repo to `unstructured` repo, the first part is
done in
https://github.com/Unstructured-IO/unstructured-inference/pull/294. This
PR adds logic to merge the extracted layout with the inferred layout.

The updated workflow for the `hi_res` strategy:
* pass the document (as data/filename) to the `inference` repo to get
`inferred_layout` (DocumentLayout)
* pass the `inferred_layout` returned from the `inference` repo and the
document (as data/filename) to the `pdfminer_processing` module, which
first opens the document (create temp file/dir as needed), and splits
the document by pages
* if is_image is `True`, return the passed
inferred_layout(DocumentLayout)
  * if is_image is `False`:
* get extracted_layout (TextRegions) from the passed
document(data/filename) by pdfminer
* merge `extracted_layout` (TextRegions) with the passed
`inferred_layout` (DocumentLayout)
* return the `inferred_layout `(DocumentLayout) with updated elements
(all merged LayoutElements) as merged_layout (DocumentLayout)
* pass merged_layout and the document (as data/filename) to the `OCR`
module, which first opens the document (create temp file/dir as needed),
and splits the document by pages (convert PDF pages to image pages for
PDF file)

### Note
This PR also fixes issue #2164 by using functionality similar to the one
implemented in the `fast` strategy workflow when extracting elements by
`pdfminer`.

### TODO
* image extraction refactor to move it from `unstructured-inference`
repo to `unstructured` repo
* improving natural reading order by applying the current default
`xycut` sorting to the elements extracted by `pdfminer`
											
										
										
											2023-12-01 12:56:31 -08:00
+								        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
-												feat: extract tables (#503)

Exposes table extraction through partition and partition_pdf.
											
										
										
											2023-04-21 12:01:29 -05:00
+								    ) as mock_process_file_with_model:
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        partition(
 								            example_doc_path("layout-parser-paper-fast.pdf"),
 								            pdf_infer_table_structure=True,
 								            strategy=PartitionStrategy.HI_RES,
 								        )
-												fix: decide table extraction (#3090)

This PR aims to add backward compatibility for the deprecated
`pdf_infer_table_structure` parameter. A missing part of turning table
extraction for PDFs and Images off by default in
https://github.com/Unstructured-IO/unstructured/pull/3035, which was
turned on in https://github.com/Unstructured-IO/unstructured/pull/2588.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-05-23 13:37:15 -07:00
+								        assert mock_process_file_with_model.call_args[1]["infer_table_structure"]
-												feat: extract tables (#503)

Exposes table extraction through partition and partition_pdf.
											
										
										
											2023-04-21 12:01:29 -05:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_pdf_with_fast_strategy(monkeypatch: MonkeyPatch):
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("layout-parser-paper-fast.pdf")
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf falls back to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
 								    mock_return = [NarrativeText("Hello there!")]
 								    with patch.object(auto, "partition_pdf", return_value=mock_return) as mock_partition:
-												enhancement: tell users to install missing extras (#1167)

### Summary

Updates `partition` to let users know to install the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating `partition_pdf` (or whichever function that requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
											
										
										
											2023-08-21 23:00:21 -04:00
+								        mock_partition_with_extras_map = {"pdf": mock_partition}
 								        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        partition(filename=file_path, strategy=PartitionStrategy.FAST)
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf falls back to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
 								    mock_partition.assert_called_once_with(
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        filename=file_path,
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf falls back to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
+								        file=None,
 								        url=None,
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        strategy=PartitionStrategy.FAST,
-												detect document language across all partitioners (#1627)

### Summary
Closes #1534 and #1535
Detects document language using `langdetect` package. 
Creates new kwargs for user to set the document language (`languages`)
or detect the language at the element level instead of the default
document level (`detect_language_per_element`)

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
											
										
										
											2023-10-10 20:47:56 -05:00
+								        languages=None,
-												Feat: return base64 encoded images for PDF's (#2310)

Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition

elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
											
										
										
											2023-12-26 21:39:01 -08:00
+								        metadata_filename=None,
 								        include_page_breaks=False,
-												BREAKING CHANGE: revert table extraction off by default for PDFs and images (#3035)

### Summary

Closes #3021 . Turns table extraction for PDFs and images off by
default. The default behavior originally changed in #2588 . The reason
for reversion is that some users did not realize turning off table
extraction was an option and experience long processing times for PDFs
and images with the new default behavior.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
											
										
										
											2024-05-17 11:28:11 -04:00
+								        infer_table_structure=False,
-												Feat: return base64 encoded images for PDF's (#2310)

Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition

elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
											
										
										
											2023-12-26 21:39:01 -08:00
+								        extract_images_in_pdf=False,
-												Refactor: rename image extraction kwargs (#2303)

Currently, we're using different kwarg names in partition() and
partition_pdf(), which has implications for the API since it goes
through partition().

### Summary
- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` to `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in
`partition.auto`
- add unit tests to test element extraction for `pdf/image` via
`partition.auto`
### Testing
CI should pass.
											
										
										
											2024-01-04 09:52:00 -08:00
+								        extract_image_block_types=None,
 								        extract_image_block_output_dir=None,
 								        extract_image_block_to_payload=False,
-												chore: add hi_res_model_name kwarg (#2289)

Closes #2160 

Explicitly adds `hi_res_model_name` as kwarg to relevant functions and
notes that `model_name` is to be deprecated.

Testing:
```
from unstructured.partition.auto import partition
filename = "example-docs/DA-1p.pdf"
elements = partition(filename, strategy="hi_res", hi_res_model_name="yolox")
```

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Steve Canny <stcanny@gmail.com>
Co-authored-by: Christine Straub <christinemstraub@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-12-22 09:06:54 -06:00
+								        hi_res_model_name=None,
-												feat: introduce `date_from_file_object` parameter to partitions (#2563)

Introduce `date_from_file_object` to `partition*` functions, by default
set to `False`.
If set to `True` and file is provided via `file` parameter, partition
will attempt to infer last modified date from `file`'s contents
otherwise last modified metadata will be set to `None`.

---------

Co-authored-by: Filip Knefel <filip@unstructured.io>
Co-authored-by: Ronny H <138828701+ron-unstructured@users.noreply.github.com>
											
										
										
											2024-03-18 02:09:44 +01:00
+								        date_from_file_object=False,
-												Introduce `start_page` argument to partitioning functions that assign `element.metadata.page_number` (#2884)

This small change will be useful for users who partition only fragments
of their PDF documents.
It's a small step towards addressing this issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

Related PRs:
* https://github.com/Unstructured-IO/unstructured/pull/2842
* https://github.com/Unstructured-IO/unstructured/pull/2673
											
										
										
											2024-04-15 23:03:42 +02:00
+								        starting_page_number=1,
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf falls back to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
+								    )
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
 								)
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type: str | None):
 								    file_path = example_doc_path("layout-parser-paper-fast.pdf")
 								    metadata_filename = file_path if pass_metadata_filename else None
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    with open(file_path, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(
 								            file=f,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								            metadata_filename=metadata_filename,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								            content_type=content_type,
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								            strategy=PartitionStrategy.HI_RES,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        )
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    # NOTE(scanny): see "with_filename" version of this test above for more on this oddness
 								    idx = 2 if sys.platform == "darwin" else 3
-												build: wolfi base image for Dockerfile (#3016)

### Summary

Updates the `Dockerfile` to use the Chainguard `wolfi-base` image to
reduce CVEs. Also adds a step in the docker publish job that scans the
images and checks for CVEs before publishing. The job will fail if there
are high or critical vulnerabilities.

### Testing

Run `make docker-run-dev` and then `python3.11` once you're in. And that
point, you can try:

```python
from unstructured.partition.auto import partition
elements = partition(filename="example-docs/DA-1p.pdf", skip_infer_table_types=["pdf"])
elements
```

Stop the container once you're done.
											
										
										
											2024-05-15 18:53:15 -04:00
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    e = elements[idx]
 								    assert isinstance(e, Title)
 								    assert e.text.startswith("LayoutParser")
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    e = elements[idx + 1]
 								    assert isinstance(e, NarrativeText)
 								    assert e.text.startswith("Zejiang Shen")
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_partition_pdf_does_not_raise_warning():
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
+								    # NOTE(robinson): This is the recommended way to check that no warning is emitted,
 								    # per the pytest docs.
 								    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
 								    #      #additional-use-cases-of-warnings-in-tests
 								    with warnings.catch_warnings():
 								        warnings.simplefilter("error")
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        partition(
 								            example_doc_path("layout-parser-paper-fast.pdf"), strategy=PartitionStrategy.HI_RES
 								        )
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												Refactor: rename image extraction kwargs (#2303)

Currently, we're using different kwarg names in partition() and
partition_pdf(), which has implications for the API since it goes
through partition().

### Summary
- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` to `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in
`partition.auto`
- add unit tests to test element extraction for `pdf/image` via
`partition.auto`
### Testing
CI should pass.
											
										
										
											2024-01-04 09:52:00 -08:00
+								@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_pdf_element_extraction(extract_image_block_to_payload: bool):
-												Refactor: rename image extraction kwargs (#2303)

Currently, we're using different kwarg names in partition() and
partition_pdf(), which has implications for the API since it goes
through partition().

### Summary
- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` to `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in
`partition.auto`
- add unit tests to test element extraction for `pdf/image` via
`partition.auto`
### Testing
CI should pass.
											
										
										
											2024-01-04 09:52:00 -08:00
+								    extract_image_block_types = ["Image", "Table"]
 								    with tempfile.TemporaryDirectory() as tmpdir:
 								        elements = partition(
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								            example_doc_path("embedded-images-tables.pdf"),
-												Refactor: rename image extraction kwargs (#2303)

Currently, we're using different kwarg names in partition() and
partition_pdf(), which has implications for the API since it goes
through partition().

### Summary
- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` to `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in
`partition.auto`
- add unit tests to test element extraction for `pdf/image` via
`partition.auto`
### Testing
CI should pass.
											
										
										
											2024-01-04 09:52:00 -08:00
+								            extract_image_block_types=extract_image_block_types,
 								            extract_image_block_to_payload=extract_image_block_to_payload,
 								            extract_image_block_output_dir=tmpdir,
 								        )
 								        assert_element_extraction(
 								            elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
 								        )
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# PPT
 								# ================================================================================================
-												feat: partition image (#144)

Adds partition_image to partition image file types, which is integrated into the partition brick. This relies on the 0.2.2 version of unstructured-inference.
											
										
										
											2023-01-13 22:24:13 -06:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
 								def test_auto_partition_ppt_from_filename():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("fake-power-point.ppt")
 								    elements = partition(file_path, strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements == EXPECTED_PPTX_OUTPUT
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    assert elements[0].metadata.filename == os.path.basename(file_path)
 								    assert elements[0].metadata.file_directory == os.path.split(file_path)[0]
-												feat: partition image (#144)

Adds partition_image to partition image file types, which is integrated into the partition brick. This relies on the 0.2.2 version of unstructured-inference.
											
										
										
											2023-01-13 22:24:13 -06:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# PPTX
 								# ================================================================================================
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
 								EXPECTED_PPTX_OUTPUT = [
 								    Title(text="Adding a Bullet Slide"),
 								    ListItem(text="Find the bullet slide layout"),
 								    ListItem(text="Use _TextFrame.text for first bullet"),
 								    ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
 								    NarrativeText(text="Here is a lot of text!"),
 								    NarrativeText(text="Here is some text in a text box!"),
 								]
 								def test_auto_partition_pptx_from_filename():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("fake-power-point.pptx")
 								    elements = partition(file_path, strategy=PartitionStrategy.HI_RES)
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    assert elements == EXPECTED_PPTX_OUTPUT
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    assert elements[0].metadata.filename == os.path.basename(file_path)
 								    assert elements[0].metadata.file_directory == os.path.split(file_path)[0]
-												feat: optional page breaks for `.pptx`, `.pdf`, `.html` and images (#205)

* page breaks for pptx

* added page breaks for image/pdf

* tests for images with page breaks

* page breaks for html documents

* linting, linting, linting

* changelog and bump version

* update docs

* fix typo

* refactor reusable code to common.py

* add type back in
											
										
										
											2023-02-08 10:11:15 -05:00
-												fix(auto): partition() passes strategy to DOC,ODT (#3278)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_doc()` or `partition_odt()` and so was not
making its way to `partition_docx()`.
											
										
										
											2024-06-25 17:29:47 -07:00
+								@pytest.mark.parametrize("file_name", ["simple.pptx", "fake-power-point.ppt"])
-												fix(auto): partition() passes strategy to PPTX,DOCX (#3273)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_pptx()` or `partition_docx()`.
											
										
										
											2024-06-21 17:16:39 -07:00
+								@pytest.mark.parametrize(
 								    "strategy",
 								    [
 								        PartitionStrategy.AUTO,
 								        PartitionStrategy.FAST,
 								        PartitionStrategy.HI_RES,
 								        PartitionStrategy.OCR_ONLY,
 								    ],
 								)
-												fix(auto): partition() passes strategy to DOC,ODT (#3278)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_doc()` or `partition_odt()` and so was not
making its way to `partition_docx()`.
											
										
										
											2024-06-25 17:29:47 -07:00
+								def test_partition_forwards_strategy_arg_to_partition_pptx_and_its_brokers(
 								    request: FixtureRequest, file_name: str, strategy: str
 								):
 								    """The `strategy` arg value received by `partition()` is received by `partition_pptx().
-												fix(auto): partition() passes strategy to PPTX,DOCX (#3273)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_pptx()` or `partition_docx()`.
											
										
										
											2024-06-21 17:16:39 -07:00
-												fix(auto): partition() passes strategy to DOC,ODT (#3278)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_doc()` or `partition_odt()` and so was not
making its way to `partition_docx()`.
											
										
										
											2024-06-25 17:29:47 -07:00
+								    To do this in the brokering-partitioner case (PPT) the strategy argument must make its way to
 								    `partition_ppt()` which must then forward it to `partition_pptx()`. This test makes sure it
 								    made it all the way.
-												fix(auto): partition() passes strategy to PPTX,DOCX (#3273)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_pptx()` or `partition_docx()`.
											
										
										
											2024-06-21 17:16:39 -07:00
-												fix(auto): partition() passes strategy to DOC,ODT (#3278)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_doc()` or `partition_odt()` and so was not
making its way to `partition_docx()`.
											
										
										
											2024-06-25 17:29:47 -07:00
+								    Note this is 2 file-types X 4 strategies = 8 test-cases.
 								    """
-												Feat/pass down strategy to partition ppt as well (#3274)

Following the same pattern of
https://github.com/Unstructured-IO/unstructured/pull/3273 and pass down
`strategy` parameter to `partition_ppt` as well.
											
										
										
											2024-06-21 21:23:58 -05:00
+								    from unstructured.partition.pptx import _PptxPartitioner
 								    def fake_iter_presentation_elements(self: _PptxPartitioner) -> Iterator[Element]:
 								        yield Text(f"strategy=={self._opts.strategy}")
 								    _iter_elements_ = method_mock(
 								        request,
 								        _PptxPartitioner,
 								        "_iter_presentation_elements",
 								        side_effect=fake_iter_presentation_elements,
 								    )
-												fix(auto): partition() passes strategy to DOC,ODT (#3278)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_doc()` or `partition_odt()` and so was not
making its way to `partition_docx()`.
											
										
										
											2024-06-25 17:29:47 -07:00
+								    (element,) = partition(example_doc_path(file_name), strategy=strategy)
-												Feat/pass down strategy to partition ppt as well (#3274)

Following the same pattern of
https://github.com/Unstructured-IO/unstructured/pull/3273 and pass down
`strategy` parameter to `partition_ppt` as well.
											
										
										
											2024-06-21 21:23:58 -05:00
 								    _iter_elements_.assert_called_once_with(ANY)
 								    assert element.text == f"strategy=={strategy}"
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# RST
 								# ================================================================================================
-												feat: add `partition_epub` function (#364)

* add pypandoc dependency

* added epub partitioner and file conversion

* test for partition_epub

* tests for file conversion

* add epub to filetype detection

* added epub to auto partition

* update bricks docs

* updated installing docs

* changelog and version

* add pandoc to dependencies

* add pandoc to debian dependencies

* linting, linting, linting

* typo fix

* typo fix

* file conversion type hints

* more type hints

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-03-14 11:52:21 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_rst_from_filename():
 								    elements = partition(example_doc_path("README.rst"))
-												feat: add `partition_epub` function (#364)

* add pypandoc dependency

* added epub partitioner and file conversion

* test for partition_epub

* tests for file conversion

* add epub to filetype detection

* added epub to auto partition

* update bricks docs

* updated installing docs

* changelog and version

* add pandoc to dependencies

* add pandoc to debian dependencies

* linting, linting, linting

* typo fix

* typo fix

* file conversion type hints

* more type hints

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-03-14 11:52:21 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements[0] == Title("Example Docs")
 								    assert elements[0].metadata.filetype == "text/x-rst"
-												feat: add `partition_epub` function (#364)

* add pypandoc dependency

* added epub partitioner and file conversion

* test for partition_epub

* tests for file conversion

* add epub to filetype detection

* added epub to auto partition

* update bricks docs

* updated installing docs

* changelog and version

* add pandoc to dependencies

* add pandoc to debian dependencies

* linting, linting, linting

* typo fix

* typo fix

* file conversion type hints

* more type hints

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-03-14 11:52:21 -04:00
-												feat: add `partition_msg` for MSFT Outlook files (#412)

* added msg-parser dependency

* pass through kwargs in convert_file_to_text

* added partition_msg for processing msft outlook files

* version bump and changelog

* added tests for partition_msg

* added test for msg with plain text

* add partition_msg docs; fix underlines in integration docs

* add .msg to file list

* finish tests for auto msg

* linting, linting, linting
											
										
										
											2023-03-28 16:15:22 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_rst_from_file():
 								    with open(example_doc_path("README.rst"), "rb") as f:
 								        elements = partition(file=f, content_type="text/x-rst")
-												feat: add `partition_msg` for MSFT Outlook files (#412)

* added msg-parser dependency

* pass through kwargs in convert_file_to_text

* added partition_msg for processing msft outlook files

* version bump and changelog

* added tests for partition_msg

* added test for msg with plain text

* add partition_msg docs; fix underlines in integration docs

* add .msg to file list

* finish tests for auto msg

* linting, linting, linting
											
										
										
											2023-03-28 16:15:22 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements[0] == Title("Example Docs")
 								    assert elements[0].metadata.filetype == "text/x-rst"
-												feat: add `partition_msg` for MSFT Outlook files (#412)

* added msg-parser dependency

* pass through kwargs in convert_file_to_text

* added partition_msg for processing msft outlook files

* version bump and changelog

* added tests for partition_msg

* added test for msg with plain text

* add partition_msg docs; fix underlines in integration docs

* add .msg to file list

* finish tests for auto msg

* linting, linting, linting
											
										
										
											2023-03-28 16:15:22 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# RTF
 								# ================================================================================================
-												feat: add `partition_rtf` for rich text files (#466)

* refactor epub; add rtf

* added test for rtf files

* filetype detection for rtf files

* add rtf to auto

* update docs for group_broken_paragraphs

* add rtf to docs

* update file list in readme

* update stage_for_transformers docs

* changelog and version bump

* skip rtf if in docker

* skip test if rtf not supported

* docs tweaks
											
										
										
											2023-04-10 17:25:03 -04:00
 								def test_auto_partition_rtf_from_filename():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(example_doc_path("fake-doc.rtf"), strategy=PartitionStrategy.HI_RES)
-												feat: add `partition_rtf` for rich text files (#466)

* refactor epub; add rtf

* added test for rtf files

* filetype detection for rtf files

* add rtf to auto

* update docs for group_broken_paragraphs

* add rtf to docs

* update file list in readme

* update stage_for_transformers docs

* changelog and version bump

* skip rtf if in docker

* skip test if rtf not supported

* docs tweaks
											
										
										
											2023-04-10 17:25:03 -04:00
+								    assert elements[0] == Title("My First Heading")
-												feat: add `url` kwarg to `partititon` (#470)

* added url option to auto partition

* add test for partition from url

* version and changelog

* update docs

* add url to element metadata
											
										
										
											2023-04-12 14:31:01 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# TSV
 								# ================================================================================================
-												feat: allow headers in `partition` (#473)

* feat: allow headers in `partition`

* warning if header is set and url is not

* update emoji test
											
										
										
											2023-04-13 11:04:15 -04:00
-												fix: parse URL response Content-Type according to RFC 9110 (#2950)

Currently, `file_and_type_from_url()` does not correctly handle the
`Content-Type` header. Specifically, it assumes that the header contains
only the mime-type (e.g. `text/html`), however, [RFC
9110](https://www.rfc-editor.org/rfc/rfc9110#field.content-type) allows
for additional directives — specifically the `charset` — to be returned
in the header. This leads to a `ValueError` when loading a URL with a
response Content-Type header such as `text/html; charset=UTF-8`.

To reproduce the issue:

```python
from unstructured.partition.auto import partition

url = "https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/"
partition(url=url)
```

Which will result in the following exception:

```python
{
	"name": "ValueError",
	"message": "Invalid file. The FileType.UNK file type is not supported in partition.",
	"stack": "---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[1], line 4
      1 from unstructured.partition.auto import partition
      3 url = \"https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/\"
----> 4 partition(url=url)

File ~/miniconda3/envs/ai-tasks/lib/python3.11/site-packages/unstructured/partition/auto.py:541, in partition(filename, content_type, file, file_filename, url, include_page_breaks, strategy, encoding, paragraph_grouper, headers, skip_infer_table_types, ssl_verify, ocr_languages, languages, detect_language_per_element, pdf_infer_table_structure, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, xml_keep_tags, data_source_metadata, metadata_filename, request_timeout, hi_res_model_name, model_name, date_from_file_object, starting_page_number, **kwargs)
    539 else:
    540     msg = \"Invalid file\" if not filename else f\"Invalid file {filename}\"
--> 541     raise ValueError(f\"{msg}. The {filetype} file type is not supported in partition.\")
    543 for element in elements:
    544     element.metadata.url = url

ValueError: Invalid file. The FileType.UNK file type is not supported in partition."
}
```

This PR fixes the issue by parsing the mime-type out of the
`Content-Type` header string.


Closes #2257
											
										
										
											2024-04-30 07:53:44 +02:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
 								def test_auto_partition_tsv_from_filename():
 								    elements = partition(example_doc_path("stanley-cups.tsv"))
-												fix: parse URL response Content-Type according to RFC 9110 (#2950)

Currently, `file_and_type_from_url()` does not correctly handle the
`Content-Type` header. Specifically, it assumes that the header contains
only the mime-type (e.g. `text/html`), however, [RFC
9110](https://www.rfc-editor.org/rfc/rfc9110#field.content-type) allows
for additional directives — specifically the `charset` — to be returned
in the header. This leads to a `ValueError` when loading a URL with a
response Content-Type header such as `text/html; charset=UTF-8`.

To reproduce the issue:

```python
from unstructured.partition.auto import partition

url = "https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/"
partition(url=url)
```

Which will result in the following exception:

```python
{
	"name": "ValueError",
	"message": "Invalid file. The FileType.UNK file type is not supported in partition.",
	"stack": "---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[1], line 4
      1 from unstructured.partition.auto import partition
      3 url = \"https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/\"
----> 4 partition(url=url)

File ~/miniconda3/envs/ai-tasks/lib/python3.11/site-packages/unstructured/partition/auto.py:541, in partition(filename, content_type, file, file_filename, url, include_page_breaks, strategy, encoding, paragraph_grouper, headers, skip_infer_table_types, ssl_verify, ocr_languages, languages, detect_language_per_element, pdf_infer_table_structure, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, xml_keep_tags, data_source_metadata, metadata_filename, request_timeout, hi_res_model_name, model_name, date_from_file_object, starting_page_number, **kwargs)
    539 else:
    540     msg = \"Invalid file\" if not filename else f\"Invalid file {filename}\"
--> 541     raise ValueError(f\"{msg}. The {filetype} file type is not supported in partition.\")
    543 for element in elements:
    544     element.metadata.url = url

ValueError: Invalid file. The FileType.UNK file type is not supported in partition."
}
```

This PR fixes the issue by parsing the mime-type out of the
`Content-Type` header string.


Closes #2257
											
										
										
											2024-04-30 07:53:44 +02:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
 								    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
 								    assert elements[0].metadata.filetype == "text/tsv"
-												fix: parse URL response Content-Type according to RFC 9110 (#2950)

Currently, `file_and_type_from_url()` does not correctly handle the
`Content-Type` header. Specifically, it assumes that the header contains
only the mime-type (e.g. `text/html`), however, [RFC
9110](https://www.rfc-editor.org/rfc/rfc9110#field.content-type) allows
for additional directives — specifically the `charset` — to be returned
in the header. This leads to a `ValueError` when loading a URL with a
response Content-Type header such as `text/html; charset=UTF-8`.

To reproduce the issue:

```python
from unstructured.partition.auto import partition

url = "https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/"
partition(url=url)
```

Which will result in the following exception:

```python
{
	"name": "ValueError",
	"message": "Invalid file. The FileType.UNK file type is not supported in partition.",
	"stack": "---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[1], line 4
      1 from unstructured.partition.auto import partition
      3 url = \"https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/\"
----> 4 partition(url=url)

File ~/miniconda3/envs/ai-tasks/lib/python3.11/site-packages/unstructured/partition/auto.py:541, in partition(filename, content_type, file, file_filename, url, include_page_breaks, strategy, encoding, paragraph_grouper, headers, skip_infer_table_types, ssl_verify, ocr_languages, languages, detect_language_per_element, pdf_infer_table_structure, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, xml_keep_tags, data_source_metadata, metadata_filename, request_timeout, hi_res_model_name, model_name, date_from_file_object, starting_page_number, **kwargs)
    539 else:
    540     msg = \"Invalid file\" if not filename else f\"Invalid file {filename}\"
--> 541     raise ValueError(f\"{msg}. The {filetype} file type is not supported in partition.\")
    543 for element in elements:
    544     element.metadata.url = url

ValueError: Invalid file. The FileType.UNK file type is not supported in partition."
}
```

This PR fixes the issue by parsing the mime-type out of the
`Content-Type` header string.


Closes #2257
											
										
										
											2024-04-30 07:53:44 +02:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# TXT
 								# ================================================================================================
-												fix: updates markdown code to process markdown with embedded html (#480)

* add carriage return to html if missing

* test on markdown with embedded html

* changelog and version

* check for html parser

* linting, linting, linting
											
										
										
											2023-04-13 12:47:45 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								EXPECTED_TEXT_OUTPUT = [
 								    NarrativeText(text="This is a test document to use for unit tests."),
 								    Address(text="Doylestown, PA 18901"),
 								    Title(text="Important points:"),
 								    ListItem(text="Hamburgers are delicious"),
 								    ListItem(text="Dogs are the best"),
 								    ListItem(text="I love fuzzy blankets"),
 								]
-												fix: update `detect_filetype` for JSONs with text/plain MIME type (#520)

* check to see if text file is a json

* add json check into filetype detection

* added test for updated file detection logic

* bytes/strings handling

* changelog and version bump
											
										
										
											2023-04-26 13:52:47 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_text_from_filename():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("fake-text.txt")
 								    elements = partition(filename=file_path, strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_TEXT_OUTPUT
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    assert elements[0].metadata.filename == os.path.basename(file_path)
 								    assert elements[0].metadata.file_directory == os.path.split(file_path)[0]
-												fix: update `detect_filetype` for JSONs with text/plain MIME type (#520)

* check to see if text file is a json

* add json check into filetype detection

* added test for updated file detection logic

* bytes/strings handling

* changelog and version bump
											
										
										
											2023-04-26 13:52:47 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_text_from_file():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    with open(example_doc_path("fake-text.txt"), "rb") as f:
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_TEXT_OUTPUT
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# XLS
 								# ================================================================================================
-												Introduce `start_page` argument to partitioning functions that assign `element.metadata.page_number` (#2884)

This small change will be useful for users who partition only fragments
of their PDF documents.
It's a small step towards addressing this issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

Related PRs:
* https://github.com/Unstructured-IO/unstructured/pull/2842
* https://github.com/Unstructured-IO/unstructured/pull/2673
											
										
										
											2024-04-15 23:03:42 +02:00
-												feat: xlsx subtable extraction (#1585)

**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.

**Technical Details**
- The function now reads the Excel sheet *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is a depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We run the count for the first non-single empty rows from the top
and bottom to determine that text

**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">

```
[
    {
        "type": "Title",
        "element_id": "3315afd97f7f2ebcd450e7c939878429",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Financial performance"
    },
    {
        "type": "Table",
        "element_id": "17f5d512705be6f8812e5dbb801ba727",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "8a9db7161a02b427f8fda883656036e1",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Operational metrics"
    },
    {
        "type": "Table",
        "element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Other"
    },
    {
        "type": "Table",
        "element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
    }
]
```
											
										
										
											2023-10-04 13:30:23 -04:00
+								EXPECTED_XLS_TEXT_LEN = 550
-												build(release): bump unstructured-inference (#1074)

* build(release): bump unstructured-inference

Related to downstream issue:
Unstructured-IO/unstructured-api#182

And upstream PR:
Unstructured-IO/unstructured-inference#165

---------

Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
											
										
										
											2023-08-10 13:57:46 -07:00
-												feat: xlsx subtable extraction (#1585)

**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.

**Technical Details**
- The function now reads the Excel sheet *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is a depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We run the count for the first non-single empty rows from the top
and bottom to determine that text

**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">

```
[
    {
        "type": "Title",
        "element_id": "3315afd97f7f2ebcd450e7c939878429",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Financial performance"
    },
    {
        "type": "Table",
        "element_id": "17f5d512705be6f8812e5dbb801ba727",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "8a9db7161a02b427f8fda883656036e1",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Operational metrics"
    },
    {
        "type": "Table",
        "element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Other"
    },
    {
        "type": "Table",
        "element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
    }
]
```
											
										
										
											2023-10-04 13:30:23 -04:00
+								EXPECTED_XLS_INITIAL_45_CLEAN_TEXT = "MC What is 2+2? 4 correct 3 incorrect MA What"
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
 								EXPECTED_XLS_TABLE = (
 								    """<table border="1" class="dataframe">
 								  <tbody>
-												feat: xlsx subtable extraction (#1585)

**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.

**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages the connected components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text

**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">

```
[
    {
        "type": "Title",
        "element_id": "3315afd97f7f2ebcd450e7c939878429",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Financial performance"
    },
    {
        "type": "Table",
        "element_id": "17f5d512705be6f8812e5dbb801ba727",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "8a9db7161a02b427f8fda883656036e1",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Operational metrics"
    },
    {
        "type": "Table",
        "element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Other"
    },
    {
        "type": "Table",
        "element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
    }
]
```
											
										
										
											2023-10-04 13:30:23 -04:00
+								    <tr>
 								      <td>MC</td>
 								      <td>What is 2+2?</td>
 								      <td>4</td>
 								      <td>correct</td>
 								      <td>3</td>
 								      <td>incorrect</td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								    </tr>
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
+								    <tr>
 								      <td>MA</td>
 								      <td>What C datatypes are 8 bits? (assume i386)</td>
 								      <td>int</td>
 								      <td></td>
 								      <td>float</td>
 								      <td></td>
 								      <td>double</td>
 								      <td></td>
 								      <td>char</td>
 								    </tr>
 								    <tr>
 								      <td>TF</td>
 								      <td>Bagpipes are awesome.</td>
 								      <td>true</td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								    </tr>
 								    <tr>
 								      <td>ESS</td>
 								      <td>How have the original Henry Hornbostel buildings """
 								    """influenced campus architecture and design in the last 30 years?</td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								    </tr>
 								    <tr>
 								      <td>ORD</td>
 								      <td>Rank the following in their order of operation.</td>
 								      <td>Parentheses</td>
 								      <td>Exponents</td>
 								      <td>Division</td>
 								      <td>Addition</td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								    </tr>
 								    <tr>
 								      <td>FIB</td>
 								      <td>The student activities fee is</td>
 								      <td>95</td>
 								      <td>dollars for students enrolled in</td>
 								      <td>19</td>
 								      <td>units or more,</td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								    </tr>
 								    <tr>
 								      <td>MAT</td>
 								      <td>Match the lower-case greek letter with its capital form.</td>
 								      <td>λ</td>
 								      <td>Λ</td>
 								      <td>α</td>
 								      <td>γ</td>
 								      <td>Γ</td>
 								      <td>φ</td>
 								      <td>Φ</td>
 								    </tr>
 								  </tbody>
 								</table>"""
 								)
-												fix: extract emojis with `partition_xlsx` (#1009)

* 🐛 fixed emoji xlsx bug

* update version and changelog

* check if beautifulsoup exists

* update docs

* fix html parser call

* fix failing attachment test

* ✅  added emoji test, added requirement, fixed dependency

* 🐛 dependency

* 🐛 correct dependency

* linting, linting, linting

* check for bs4

* skip auto xls filename test

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
											
										
										
											2023-08-04 16:14:08 +02:00
+								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_xls_from_filename():
 								    elements = partition(
 								        example_doc_path("tests-example.xls"), include_header=False, skip_infer_table_types=[]
 								    )
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
-												feat: xlsx subtable extraction (#1585)

**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.

**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages the connected components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text

**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">

```
[
    {
        "type": "Title",
        "element_id": "3315afd97f7f2ebcd450e7c939878429",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Financial performance"
    },
    {
        "type": "Table",
        "element_id": "17f5d512705be6f8812e5dbb801ba727",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "3315afd97f7f2ebcd450e7c939878429",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Quarterly revenue</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Group financial performance</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Segmental results</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <td>Segmental analysis</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <td>Cash flow</td>\n      <td>FY 22</td>\n      <td>FY 23</td>\n      <td></td>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "8a9db7161a02b427f8fda883656036e1",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Operational metrics"
    },
    {
        "type": "Table",
        "element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "8a9db7161a02b427f8fda883656036e1",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Mobile customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <td>Fixed broadband customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <td>Marketable homes passed</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>8</td>\n    </tr>\n    <tr>\n      <td>TV customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <td>Converged customers</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <td>Mobile churn</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>11</td>\n    </tr>\n    <tr>\n      <td>Mobile data usage</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <td>Mobile ARPU</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
    },
    {
        "type": "Title",
        "element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "Other"
    },
    {
        "type": "Table",
        "element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
        "metadata": {
            "filename": "vodafone.xlsx",
            "file_directory": "example-docs",
            "last_modified": "2023-10-03T17:51:34",
            "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
            "languages": [
                "spa",
                "ita"
            ],
            "page_number": 1,
            "page_name": "Index",
            "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Topic</td>\n      <td>Period</td>\n      <td></td>\n      <td></td>\n      <td>Page</td>\n    </tr>\n    <tr>\n      <td>Average foreign exchange rates</td>\n      <td>Nine quarters to 30 June 2023</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n    <tr>\n      <td>Guidance rates</td>\n      <td>FY 23/24</td>\n      <td></td>\n      <td></td>\n      <td>14</td>\n    </tr>\n  </tbody>\n</table>"
        },
        "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
    }
]
```
											
										
										
											2023-10-04 13:30:23 -04:00
+								    assert sum(isinstance(element, Table) for element in elements) == 2
-												fix(xlsx): xlsx subtable algorithm (#2534)

**Reviewers:** It may be easier to review each of the two commits
separately. The first adds the new `_SubtableParser` object with its
unit-tests and the second one uses that object to replace the flawed
existing subtable-parsing algorithm.

**Summary**

There is a cluster of bugs in `partition_xlsx()` that all derive from
flaws in the algorithm we use to detect "subtables". These are
encountered when the user wants to get multiple document-elements from
each worksheet, which is the default (argument `find_subtable = True`).

This PR replaces the flawed existing algorithm with a `_SubtableParser`
object that encapsulates all that logic and has thorough unit-tests.

**Additional Context**

This is a summary of the failure cases. There are a few other cases but
they're closely related and this was enough evidence and scope for my
purposes. This PR fixes all these bugs:
```python
    #
    # -- ✅ CASE 1: There are no leading or trailing single-cell rows.
    #       -> the subtable functions never get called, subtable is emitted as the only element
    #
    #    a b  -> Table(a, b, c, d)
    #    c d

    # -- ✅ CASE 2: There is exactly one leading single-cell row.
    #       -> Leading single-cell row emitted as `Title` element, core-table properly identified.
    #
    #    a    -> [ Title(a),
    #    b c       Table(b, c, d, e) ]
    #    d e

    # -- ❌ CASE 3: There are two-or-more leading single-cell rows.
    #       -> leading single-cell rows are included in subtable
    #
    #    a    -> [ Table(a, b, c, d, e, f) ]
    #    b
    #    c d
    #    e f

    # -- ❌ CASE 4: There is exactly one trailing single-cell row.
    #      -> core table is dropped. trailing single-cell row is emitted as Title
    #         (this is the behavior in the reported bug)
    #
    #    a b  -> [ Title(e) ]
    #    c d
    #      e

    # -- ❌ CASE 5: There are two-or-more trailing single-cell rows.
    #      -> core table is dropped. trailing single-cell rows are each emitted as a Title
    #
    #    a b  -> [ Title(e),
    #    c d       Title(f) ]
    #      e
    #      f

    # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows.
    #      -> core table is correctly identified, leading and trailing single-cell rows are each
    #         emitted as a Title.
    #
    #      a  -> [ Title(a),
    #    b c       Table(b, c, d, e),
    #    d e       Title(f) ]
    #    f

    # -- ✅ CASE 7: There are two leading and one trailing single-cell rows.
    #      -> core table is correctly identified, leading and trailing single-cell rows are each
    #         emitted as a Title.
    #
    #    a    -> [ Title(a),
    #    b         Title(b),
    #    c d       Table(c, d, e, f),
    #    e f       Title(g) ]
    #      g

    # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows.
    #      -> core table is correctly identified, leading and trailing single-cell rows are each
    #         emitted as a Title.
    #
    #      a  -> [ Title(a),
    #      b       Title(b),
    #    c d       Table(c, d, e, f),
    #    e f       Title(g),
    #    g         Title(h) ]
    #    h

    # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below.
    #      -> First cell is mistakenly emitted as title, remaining cells are dropped.
    #
    #    a b c  -> [ Title(a) ]

    # -- ❌ CASE 10: Single-row subtable with one leading single-cell row.
    #      -> Leading single-cell row is correctly identified as title, core-table is mis-identified
    #         as a `Title` and truncated.
    #
    #    a      -> [ Title(a),
    #    b c d       Title(b) ]
```
											
										
										
											2024-02-13 20:29:17 -08:00
+								    assert len(elements) == 14
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
 								    assert clean_extra_whitespace(elements[0].text)[:45] == EXPECTED_XLS_INITIAL_45_CLEAN_TEXT
-												build(release): bump unstructured-inference (#1074)

* build(release): bump unstructured-inference

Related to downstream issue:
Unstructured-IO/unstructured-api#182

And upstream PR:
Unstructured-IO/unstructured-inference#165

---------

Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
											
										
										
											2023-08-10 13:57:46 -07:00
+								    # NOTE(crag): if the beautifulsoup4 package is installed, some (but not all) additional
 								    # whitespace is removed, so the expected text length is less than is the case
 								    # when beautifulsoup4 is *not* installed. E.g.
 								    # "\n\n\nMA\nWhat C datatypes are 8 bits" vs.
 								    # '\n  \n    \n      MA\n      What C datatypes are 8 bits?... "
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
+								    assert len(elements[0].text) == EXPECTED_XLS_TEXT_LEN
 								    assert elements[0].metadata.text_as_html == EXPECTED_XLS_TABLE
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# XLSX
 								# ================================================================================================
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
-												Chore: Pass table support  param to partition image (#973)

* add param and test in image table extraction

* version and changelog

* need to publish this one for api repo

* add new param skip_infer_table_types

* use warning

* clean up with mapping

* add test for tsv

* fix test fail

* weird change from merge

* doc nit

* don't use mapping

* correct conflict
											
										
										
											2023-07-27 13:33:36 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_xlsx_from_filename():
 								    elements = partition(
 								        example_doc_path("stanley-cups.xlsx"), include_header=False, skip_infer_table_types=[]
 								    )
-												Chore: Pass table support  param to partition image (#973)

* add param and test in image table extraction

* version and changelog

* need to publish this one for api repo

* add new param skip_infer_table_types

* use warning

* clean up with mapping

* add test for tsv

* fix test fail

* weird change from merge

* doc nit

* don't use mapping

* correct conflict
											
										
										
											2023-07-27 13:33:36 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert sum(isinstance(element, Table) for element in elements) == 2
 								    assert sum(isinstance(element, Title) for element in elements) == 2
 								    assert len(elements) == 4
-												Chore: Pass table support  param to partition image (#973)

* add param and test in image table extraction

* version and changelog

* need to publish this one for api repo

* add new param skip_infer_table_types

* use warning

* clean up with mapping

* add test for tsv

* fix test fail

* weird change from merge

* doc nit

* don't use mapping

* correct conflict
											
										
										
											2023-07-27 13:33:36 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
 								    assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT_XLSX
 								    assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX
 								    assert elements[1].metadata.page_number == 1
 								    assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
-												enhancement: handling for empty files in `detect_filetype` and `partition` (#710)

* add empty filetype

* add empty handling to partition

* changelog and version
											
										
										
											2023-06-09 16:07:50 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_xlsx_from_file():
 								    with open(example_doc_path("stanley-cups.xlsx"), "rb") as f:
 								        elements = partition(file=f, include_header=False, skip_infer_table_types=[])
-												enhancement: handling for empty files in `detect_filetype` and `partition` (#710)

* add empty filetype

* add empty handling to partition

* changelog and version
											
										
										
											2023-06-09 16:07:50 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert sum(isinstance(element, Table) for element in elements) == 2
 								    assert sum(isinstance(element, Title) for element in elements) == 2
 								    assert len(elements) == 4
-												feature(html partition): parse pre tag (#642)

* feature(html partition): parse pre tag

* chore: update CHANGELOG.md

* style: black format xml.py

* Added tests for html with pre tag

* remove skip test, update parse pre tag

* fix style

* chore: spell check

* chore: update changelog & version

* chore: update ingest test fixtures

* chore: add exception handling if `element.text` is `None` in `_read_xml`

* test: add more sanity testing on the `.text` content of the element(s)

* refactor: move the conditional logic for <pre> outside of the `try/except` block

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
											
										
										
											2023-06-27 21:52:39 +03:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
 								    assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT_XLSX
 								    assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX
 								    assert elements[1].metadata.page_number == 1
 								    assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE
-												feature(html partition): parse pre tag (#642)

* feature(html partition): parse pre tag

* chore: update CHANGELOG.md

* style: black format xml.py

* Added tests for html with pre tag

* remove skip test, update parse pre tag

* fix style

* chore: spell check

* chore: update changelog & version

* chore: update ingest test fixtures

* chore: add exception handling if `element.text` is `None` in `_read_xml`

* test: add more sanity testing on the `.text` content of the element(s)

* refactor: move the conditional logic for <pre> outside of the `try/except` block

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
											
										
										
											2023-06-27 21:52:39 +03:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_respects_starting_page_number_argument_for_xlsx():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(example_doc_path("stanley-cups.xlsx"), starting_page_number=3)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements[1].metadata.page_number == 3
-												enhancement: handling for empty files in `detect_filetype` and `partition` (#710)

* add empty filetype

* add empty handling to partition

* changelog and version
											
										
										
											2023-06-09 16:07:50 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# XML
 								# ================================================================================================
-												feat: `partition_rst` for ReStructured Text documents (#725)

* add example rst file

* filetype detection for rst files

* add partition_rst function

* add partition_rst to auto

* update readme

* update docs

* changelog and version

* pandocs -> pandoc

* fix typo
											
										
										
											2023-06-12 15:31:10 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_xml_from_filename():
 								    file_path = example_doc_path("factbook.xml")
-												feat: partition_org for Org Mode documents (#780)

* feat: partition_org for Org Mode documents

* update version
											
										
										
											2023-06-23 20:45:31 +02:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    elements = partition(file_path, xml_keep_tags=False, metadata_filename=file_path)
-												feat: partition_org for Org Mode documents (#780)

* feat: partition_org for Org Mode documents

* update version
											
										
										
											2023-06-23 20:45:31 +02:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements[0].text == "United States"
 								    assert elements[0].metadata.filename == "factbook.xml"
-												feat: partition_org for Org Mode documents (#780)

* feat: partition_org for Org Mode documents

* update version
											
										
										
											2023-06-23 20:45:31 +02:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_xml_from_file():
 								    with open(example_doc_path("factbook.xml"), "rb") as f:
 								        elements = partition(file=f, xml_keep_tags=False)
-												feat: partition_org for Org Mode documents (#780)

* feat: partition_org for Org Mode documents

* update version
											
										
										
											2023-06-23 20:45:31 +02:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements[0].text == "United States"
-												feat: partition_org for Org Mode documents (#780)

* feat: partition_org for Org Mode documents

* update version
											
										
										
											2023-06-23 20:45:31 +02:00
-												feat: `partition_rst` for ReStructured Text documents (#725)

* add example rst file

* filetype detection for rst files

* add partition_rst function

* add partition_rst to auto

* update readme

* update docs

* changelog and version

* pandocs -> pandoc

* fix typo
											
										
										
											2023-06-12 15:31:10 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_xml_from_filename_with_tags():
 								    elements = partition(example_doc_path("factbook.xml"), xml_keep_tags=True)
-												feat: `partition_rst` for ReStructured Text documents (#725)

* add example rst file

* filetype detection for rst files

* add partition_rst function

* add partition_rst to auto

* update readme

* update docs

* changelog and version

* pandocs -> pandoc

* fix typo
											
										
										
											2023-06-12 15:31:10 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert "<leader>Joe Biden</leader>" in elements[0].text
 								    assert elements[0].metadata.filename == "factbook.xml"
-												feat: `partition_rst` for ReStructured Text documents (#725)

* add example rst file

* filetype detection for rst files

* add partition_rst function

* add partition_rst to auto

* update readme

* update docs

* changelog and version

* pandocs -> pandoc

* fix typo
											
										
										
											2023-06-12 15:31:10 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_xml_from_file_with_tags():
 								    with open(example_doc_path("factbook.xml"), "rb") as f:
 								        elements = partition(file=f, xml_keep_tags=True)
-												fixed filename metadata bug when using file and file_filename (#1002)


											
										
										
											2023-08-02 18:14:15 -07:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert "<leader>Joe Biden</leader>" in elements[0].text
-												fixed filename metadata bug when using file and file_filename (#1002)


											
										
										
											2023-08-02 18:14:15 -07:00
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# FILE_TYPE NOT RECOGNIZED OR NOT SUPPORTED
 								# ================================================================================================
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_raises_with_bad_type(request: FixtureRequest):
 								    detect_filetype_ = function_mock(
 								        request, "unstructured.partition.auto.detect_filetype", return_value=None
 								    )
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    with pytest.raises(ValueError):
 								        partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)
-												enhancement: tell users to install missing extras (#1167)

### Summary

Updates `partition` to let users know to install the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating `partition_pdf` (or whichever function that requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
											
										
										
											2023-08-21 23:00:21 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    detect_filetype_.assert_called_once_with(
 								        content_type=None, encoding=None, file=None, file_filename=None, filename="made-up.fake"
 								    )
-												enhancement: tell users to install missing extras (#1167)

### Summary

Updates `partition` to let users know to install the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating `partition_pdf` (or whichever function that requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
											
										
										
											2023-08-21 23:00:21 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# LOAD FROM URL
 								# ================================================================================================
 								def test_auto_partition_from_url():
 								    """Partition a remote document given an explicit `text/plain` content-type.
 								
 								    The first element must be the "Apache License" title and its metadata must carry the
 								    URL the document was requested from.
 								    """
 								    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
 								    first_element = partition(
 								        url=license_url, content_type="text/plain", strategy=PartitionStrategy.HI_RES
 								    )[0]
 								    assert first_element == Title("Apache License")
 								    assert first_element.metadata.url == license_url
 								def test_auto_partition_from_url_with_rfc9110_content_type():
 								    """Partition a remote document when the content-type carries a `charset` parameter.
 								
 								    The expectations are the same as for a bare `text/plain` content-type: the first
 								    element is the "Apache License" title and its metadata carries the requested URL.
 								    """
 								    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
 								    content_type_with_charset = "text/plain; charset=utf-8"
 								    first_element = partition(
 								        url=license_url,
 								        content_type=content_type_with_charset,
 								        strategy=PartitionStrategy.HI_RES,
 								    )[0]
 								    assert first_element == Title("Apache License")
 								    assert first_element.metadata.url == license_url
 								def test_auto_partition_from_url_without_providing_content_type():
 								    """Partition a remote document without passing any `content_type` argument.
 								
 								    The first element must still be the "Apache License" title and its metadata must
 								    carry the URL the document was requested from.
 								    """
 								    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
 								    first_element = partition(url=license_url, strategy=PartitionStrategy.HI_RES)[0]
 								    assert first_element == Title("Apache License")
 								    assert first_element.metadata.url == license_url
 								def test_auto_partition_warns_if_header_set_and_not_url(caplog: LogCaptureFixture):
 								    partition(
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        example_doc_path("eml/fake-email.eml"),
 								        headers={"Accept": "application/pdf"},
 								        strategy=PartitionStrategy.HI_RES,
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    )
 								    assert caplog.records[0].levelname == "WARNING"
 								def test_partition_timeout_gets_routed():
 								    """The `request_timeout` argument of `partition()` is forwarded to the URL loader.
 								
 								    `file_and_type_from_url` is replaced by a mock that raises a sentinel exception, so
 								    the call stops as soon as the loader is reached and its keyword arguments can be
 								    inspected.
 								    """
 								
 								    class CallException(Exception):
 								        pass
 								
 								    url_loader_ = Mock(side_effect=CallException("Function called!"))
 								    # -- the patch is entered first, then the raises-context, as in a combined `with` --
 								    with patch("unstructured.partition.auto.file_and_type_from_url", url_loader_):
 								        with pytest.raises(CallException):
 								            auto.partition(url="fake_url", request_timeout=326)
 								
 								    received_kwargs = url_loader_.call_args.kwargs
 								    assert "request_timeout" in received_kwargs
 								    assert received_kwargs["request_timeout"] == 326
 								# ================================================================================================
 								# OTHER ARGS
 								# ================================================================================================
 								# -- chunking_strategy ----------------------------------------------------
-												chunk_by_title decorator (#1304)

### Summary

Partial solution to #1185.
Related to #1222.
Creates decorator from `chunk_by_title` cleaning brick.
Breaks a document into sections based on the presence of Title elements.
Also starts a new section under the following conditions:

- If metadata changes, indicating a change in section or page or a
switch to processing attachments. If `multipage_sections=True`, sections
can span pages. `multipage_sections` defaults to True.
- If the length of the section exceeds `new_after_n_chars` characters.
The default is 1500. The **chunking function does not split individual
elements**, so it's possible for a section to exceed that threshold if
an individual element is over `new_after_n_chars` characters, which
could occur with a long NarrativeText element.

Combines sections under these conditions
- Sections under `combine_under_n_chars` characters are combined. The
default is 500.

### Testing

from unstructured.partition.html import partition_html

url = "https://understandingwar.org/backgrounder/russian-offensive-campaign-assessment-august-27-2023-0"
chunks = partition_html(url=url, chunking_strategy="by_title")

for chunk in chunks:
    print(chunk)
    print("\n\n" + "-"*80)
    input()

											
										
										
											2023-09-11 16:00:14 -05:00
 								def test_add_chunking_strategy_on_partition_auto():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("example-10k-1p.html")
 								    elements = partition(file_path)
 								    chunk_elements = partition(file_path, chunking_strategy="by_title")
-												chunk_by_title decorator (#1304)

### Summary

Partial solution to #1185.
Related to #1222.
Creates decorator from `chunk_by_title` cleaning brick.
Breaks a document into sections based on the presence of Title elements.
Also starts a new section under the following conditions:

- If metadata changes, indicating a change in section or page or a
switch to processing attachments. If `multipage_sections=True`, sections
can span pages. `multipage_sections` defaults to True.
- If the length of the section exceeds `new_after_n_chars` characters.
The default is 1500. The **chunking function does not split individual
elements**, so it's possible for a section to exceed that threshold if
an individual element is over `new_after_n_chars` characters, which
could occur with a long NarrativeText element.

Combines sections under these conditions
- Sections under `combine_under_n_chars` characters are combined. The
default is 500.

### Testing

from unstructured.partition.html import partition_html

url = "https://understandingwar.org/backgrounder/russian-offensive-campaign-assessment-august-27-2023-0"
chunks = partition_html(url=url, chunking_strategy="by_title")

for chunk in chunks:
    print(chunk)
    print("\n\n" + "-"*80)
    input()

											
										
										
											2023-09-11 16:00:14 -05:00
+								    chunks = chunk_by_title(elements)
 								    assert chunk_elements != elements
 								    assert chunk_elements == chunks
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
In [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("example-10k-1p.html")
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
In [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
 								    # default chunk size in chars is 200
 								    partitioned_table_elements_200_chars = [
 								        e
 								        for e in partition(
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								            file_path,
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
In [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								            chunking_strategy="by_title",
 								            max_characters=200,
 								            combine_text_under_n_chars=5,
 								        )
 								        if isinstance(e, (Table, TableChunk))
 								    ]
 								    partitioned_table_elements_5_chars = [
 								        e
 								        for e in partition(
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								            file_path,
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
In [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								            chunking_strategy="by_title",
 								            max_characters=5,
 								            combine_text_under_n_chars=5,
 								        )
 								        if isinstance(e, (Table, TableChunk))
 								    ]
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(file_path)
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
In [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
 								    table_elements = [e for e in elements if isinstance(e, Table)]
 								    assert len(partitioned_table_elements_5_chars) != len(table_elements)
 								    assert len(partitioned_table_elements_200_chars) != len(table_elements)
-												feature(chunking): add basic strategy and overlap (#2367)

This PR culminates the restructuring of chunking over my prior
dozen-or-so commits by adding the new options to the API and
documentation.

Separately I'll be adding a new ingest test to defend against
regression, although the integration test included in this PR will do a
pretty good job of that too.
											
										
										
											2024-01-10 14:19:24 -08:00
+								    # trailing whitespace is stripped from the first chunk, leaving only a checkbox character
 								    assert len(partitioned_table_elements_5_chars[0].text) == 1
 								    # but the second chunk is the full 5 characters
-												rfctr(chunking): split oversized chunks on word boundary (#2297)

The text of an oversized chunk is split on an arbitrary character
boundary (mid-word). The `chunk_by_character()` strategy introduces the
idea of allowing the user to specify a separator to use for
chunk-splitting. For `langchain` this is typically "\n\n", "\n", or " ";
blank-line, newline, or word boundaries respectively.

Even if the user is allowed to specify a separator, we must provide
fall-back for when a chunk contains no such character. This can be done
incrementally, like blank-line is preferable to newline, newline is
preferable to word, and word is preferable to arbitrary character.

Further, there is nothing particular to `chunk_by_character()` in
providing such a fall-back text-splitting strategy. It would be
preferable for all strategies to split oversized chunks on even-word
boundaries for example.

Note that while a "blank-line" ("\n\n") may be common in plain text, it
is unlikely to appear in the text of an element because it would have
been interpreted as an element boundary during partitioning.

Add _TextSplitter with basic separator preferences and fall-back and
apply it to chunk-splitting for all strategies. The `by_character`
chunking strategy may enhance this behavior by adding the option for a
user to specify a particular separator suited to their use case.
											
										
										
											2023-12-20 21:45:36 -08:00
+								    assert len(partitioned_table_elements_5_chars[1].text) == 5
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert len(cast(str, partitioned_table_elements_5_chars[0].metadata.text_as_html)) == 5
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
In [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
 								    # the first table element is under 200 chars so doesn't get chunked!
 								    assert table_elements[0] == partitioned_table_elements_200_chars[0]
 								    assert len(partitioned_table_elements_200_chars[0].text) < 200
-												feature(chunking): add basic strategy and overlap (#2367)

This PR culminates the restructuring of chunking over my prior
dozen-or-so commits by adding the new options to the API and
documentation.

Separately I'll be adding a new ingest test to defend against
regression, although the integration test included in this PR will do a
pretty good job of that too.
											
										
										
											2024-01-10 14:19:24 -08:00
+								    assert len(partitioned_table_elements_200_chars[1].text) == 198
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert len(cast(str, partitioned_table_elements_200_chars[1].metadata.text_as_html)) == 200
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
In [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
 								def test_add_chunking_strategy_chars_on_partition_auto_adds_is_continuation():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("example-10k-1p.html")
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
In [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    table_elements = [e for e in partition(file_path) if isinstance(e, Table)]
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    table_chunks = [
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
In [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								        e
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        for e in partition(file_path, chunking_strategy="by_title")
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        if isinstance(e, (Table, TableChunk))
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
In [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								    ]
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert table_elements != table_chunks
-												chore: adding max_characters to other element type chunking (#1673)

This PR adds the `max_characters` (hard max) param to non-table element
chunking. Additionally updates the `num_characters` metadata to
`max_characters` to make it clearer which param we're referencing.

To test:

```
from unstructured.partition.html import partition_html

filename = "example-docs/example-10k-1p.html"
chunk_elements = partition_html(
        filename,
        chunking_strategy="by_title",
        combine_text_under_n_chars=0,
        new_after_n_chars=50,
        max_characters=100,
    )

for chunk in chunk_elements:
     print(len(chunk.text))

# previously we were only respecting the "soft max" (default of 500) for elements other than tables
# now we should see that all the elements have text fields under 100 chars.
```

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-09 12:42:36 -07:00
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
In [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								    i = 0
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    for chunk in table_chunks:
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
In [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								        # have to reset the counter to 0 here when we encounter a Table element
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        if not isinstance(chunk, TableChunk):
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
In [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								            i = 0
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        if i > 0 and isinstance(chunk, TableChunk):
 								            assert chunk.metadata.is_continuation is True
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
In [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								            i += 1
-												detect document language across all partitioners (#1627)

### Summary
Closes #1534 and #1535
Detects document language using `langdetect` package. 
Creates new kwargs for user to set the document language (`languages`)
or detect the language at the element level instead of the default
document level (`detect_language_per_element`)

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
											
										
										
											2023-10-10 20:47:56 -05:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# -- detect_language_per_element ------------------------------------------
 								def test_partition_respects_detect_language_per_element_arg():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(
 								        example_doc_path("language-docs/eng_spa_mult.txt"), detect_language_per_element=True
 								    )
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    langs = [element.metadata.languages for element in elements]
 								    assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
 								# -- languages ------------------------------------------------------------
-												detect document language across all partitioners (#1627)

### Summary
Closes #1534 and #1535
Detects document language using `langdetect` package. 
Creates new kwargs for user to set the document language (`languages`)
or detect the language at the element level instead of the default
document level (`detect_language_per_element`)

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
											
										
										
											2023-10-10 20:47:56 -05:00
 								@pytest.mark.parametrize(
 								    "file_extension",
 								    [
 								        "doc",
 								        "docx",
 								        "eml",
 								        "epub",
 								        "html",
 								        "md",
 								        "odt",
 								        "org",
 								        "ppt",
 								        "pptx",
 								        "rst",
 								        "rtf",
 								        "txt",
 								        "xml",
 								    ],
 								)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_partition_respects_language_arg(file_extension: str):
 								    elements = partition(
 								        example_doc_path(f"language-docs/eng_spa_mult.{file_extension}"), languages=["deu"]
 								    )
-												detect document language across all partitioners (#1627)

### Summary
Closes #1534 and #1535
Detects document language using `langdetect` package. 
Creates new kwargs for user to set the document language (`languages`)
or detect the language at the element level instead of the default
document level (`detect_language_per_element`)

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
											
										
										
											2023-10-10 20:47:56 -05:00
+								    assert all(element.metadata.languages == ["deu"] for element in elements)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# -- include_page_breaks --------------------------------------------------
 								def test_auto_with_page_breaks():
 								    elements = partition(
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        example_doc_path("layout-parser-paper-fast.pdf"),
 								        include_page_breaks=True,
 								        strategy=PartitionStrategy.HI_RES,
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    )
 								    assert "PageBreak" in [elem.category for elem in elements]
 								# -- metadata_filename ----------------------------------------------------
 								def test_auto_partition_metadata_filename():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("fake-text.txt")
 								    with open(file_path, "rb") as f:
 								        elements = partition(file=f, metadata_filename=file_path)
 								    assert elements[0].metadata.filename == os.path.split(file_path)[-1]
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								def test_auto_partition_warns_about_file_filename_deprecation(caplog: LogCaptureFixture):
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("fake-text.txt")
 								    with open(file_path, "rb") as f:
 								        elements = partition(file=f, file_filename=file_path)
 								    assert elements[0].metadata.filename == os.path.split(file_path)[-1]
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert "WARNING" in caplog.text
 								    assert "The file_filename kwarg will be deprecated" in caplog.text
 								def test_auto_partition_raises_with_file_and_metadata_filename():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("fake-text.txt")
 								    with open(file_path, "rb") as f, pytest.raises(ValueError):
 								        partition(file=f, file_filename=file_path, metadata_filename=file_path)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								# -- ocr_languages --------------------------------------------------------
 								def test_auto_partition_formats_languages_for_tesseract():
 								    with patch(
 								        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
 								    ) as mock_process_file_with_ocr:
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        partition(
 								            example_doc_path("chi_sim_image.jpeg"),
 								            strategy=PartitionStrategy.HI_RES,
 								            languages=["zh"],
 								        )
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        _, kwargs = mock_process_file_with_ocr.call_args_list[0]
 								        assert "ocr_languages" in kwargs
 								        assert kwargs["ocr_languages"] == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
 								@pytest.mark.parametrize(("languages", "ocr_languages"), [(["auto"], ""), (["eng"], "")])
 								def test_auto_partition_ignores_empty_string_for_ocr_languages(
 								    languages: list[str], ocr_languages: str
 								):
 								    elements = partition(
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        example_doc_path("book-war-and-peace-1p.txt"),
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        strategy=PartitionStrategy.OCR_ONLY,
 								        ocr_languages=ocr_languages,
 								        languages=languages,
 								    )
 								    assert elements[0].metadata.languages == ["eng"]
 								def test_auto_partition_warns_with_ocr_languages(caplog: LogCaptureFixture):
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    partition(
 								        example_doc_path("chevron-page.pdf"), strategy=PartitionStrategy.HI_RES, ocr_languages="eng"
 								    )
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert "The ocr_languages kwarg will be deprecated" in caplog.text
 								# -- skip_infer_table_types -----------------------------------------------
 								@pytest.mark.parametrize(
 								    ("skip_infer_table_types", "filename", "has_text_as_html_field"),
 								    [
 								        # -- each file is partitioned once with its own type listed in
 								        # -- `skip_infer_table_types` (no `text_as_html` expected) and once with an
 								        # -- empty skip-list (`text_as_html` expected).
 								        (["xlsx"], "stanley-cups.xlsx", False),
 								        ([], "stanley-cups.xlsx", True),
 								        (["odt"], "fake.odt", False),
 								        ([], "fake.odt", True),
 								    ],
 								)
 								def test_auto_partition_respects_skip_infer_table_types(
 								    skip_infer_table_types: list[str], filename: str, has_text_as_html_field: bool
 								):
 								    """Every `Table` has `.metadata.text_as_html` populated only when expected.

 								    The file is partitioned from an open binary file object with the parametrized
 								    `skip_infer_table_types` value, and each resulting `Table` element is checked for a
 								    non-None `text_as_html` metadata field.
 								    """
 								    with open(example_doc_path(filename), "rb") as f:
 								        # -- only Table elements are relevant to this behavior --
 								        table_elements = [
 								            e
 								            for e in partition(file=f, skip_infer_table_types=skip_infer_table_types)
 								            if isinstance(e, Table)
 								        ]
 								        # -- NOTE(review): when no Table elements are produced this loop never runs and
 								        # -- the test passes without asserting anything.
 								        for table_element in table_elements:
 								            # -- "has the field" means the attribute exists AND is not None --
 								            table_element_has_text_as_html_field = (
 								                hasattr(table_element.metadata, "text_as_html")
 								                and table_element.metadata.text_as_html is not None
 								            )
 								            assert table_element_has_text_as_html_field == has_text_as_html_field
 								# ================================================================================================
 								# METADATA BEHAVIORS
 								# ================================================================================================
 								# -- .filetype ------------------------------------------------------------
 								# -- every `FileType` member except UNK, ZIP and XLS; used to parametrize
 								# -- `test_file_specific_produces_correct_filetype()`.
 								supported_filetypes = [t for t in FileType if t not in (FileType.UNK, FileType.ZIP, FileType.XLS)]
 								# -- file-types whose partitioner module name differs from the lower-cased `FileType` member
 								# -- name; file-types not listed here fall back to that lower-cased name.
 								FILETYPE_TO_MODULE = {
 								    FileType.JPG: "image",
 								    FileType.PNG: "image",
 								    FileType.HEIC: "image",
 								    FileType.TXT: "text",
 								    FileType.EML: "email",
 								}
 								@pytest.mark.parametrize(
 								    ("content_type", "routing_func", "expected"),
 								    [
 								        ("text/csv", "csv", "text/csv"),
 								        ("text/html", "html", "text/html"),
 								        # -- a content-type string that is not a real MIME-type; the partitioner mocked for
 								        # -- this case is `partition_pdf` and no filetype is expected in metadata.
 								        ("jdsfjdfsjkds", "pdf", None),
 								    ],
 								)
 								def test_auto_adds_filetype_to_metadata(
 								    request: FixtureRequest,
 								    content_type: str,
 								    routing_func: str,
 								    expected: str | None,
 								    monkeypatch: MonkeyPatch,
 								):
 								    """`partition()` sets `.metadata.filetype` on elements returned by the partitioner.

 								    The file-specific partitioner is replaced with a mock returning two `Text` elements
 								    constructed without metadata, so any `filetype` value observed afterward was applied
 								    by `partition()` itself.
 								    """
 								    partition_fn_ = function_mock(
 								        request,
 								        f"unstructured.partition.auto.partition_{routing_func}",
 								        return_value=[Text("text 1"), Text("text 2")],
 								    )
 								    # -- replace the dispatch map so only the mocked partitioner is available --
 								    mock_partition_with_extras_map = {routing_func: partition_fn_}
 								    monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
 								    # -- NOTE(review): this path is relative to the current working directory, unlike the
 								    # -- `example_doc_path()` calls used by other tests in this module; confirm the suite
 								    # -- is always run from the repository root.
 								    elements = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type)
 								    assert len(elements) == 2
 								    assert all(el.metadata.filetype == expected for el in elements)
 								@pytest.mark.parametrize(
 								    ("content_type", "expected"),
 								    [
 								        # -- the PDF MIME-type is expected both with and without an explicit content-type --
 								        ("application/pdf", FILETYPE_TO_MIMETYPE[FileType.PDF]),
 								        (None, FILETYPE_TO_MIMETYPE[FileType.PDF]),
 								    ],
 								)
 								def test_auto_filetype_overrides_file_specific(
 								    request: FixtureRequest, content_type: str | None, expected: str, monkeypatch: MonkeyPatch
 								):
 								    """`partition()` replaces a `filetype` value set by the file-specific partitioner.

 								    The mocked `partition_pdf` returns elements whose metadata carries the bogus filetype
 								    "imapdf"; the elements returned by `partition()` must carry the PDF MIME-type instead.
 								    """
 								    pdf_metadata = ElementMetadata(filetype="imapdf")
 								    partition_pdf_ = function_mock(
 								        request,
 								        "unstructured.partition.auto.partition_pdf",
 								        return_value=[Text("text 1", metadata=pdf_metadata), Text("text 2", metadata=pdf_metadata)],
 								    )
 								    # -- replace the dispatch map so only the mocked PDF partitioner is available --
 								    mock_partition_with_extras_map = {"pdf": partition_pdf_}
 								    monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
 								    # -- NOTE(review): this path is relative to the current working directory; confirm the
 								    # -- suite is always run from the repository root.
 								    elements = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type)
 								    assert len(elements) == 2
 								    assert all(el.metadata.filetype == expected for el in elements)
 								@pytest.mark.parametrize("filetype", supported_filetypes)
 								def test_file_specific_produces_correct_filetype(filetype: FileType):
 								    if filetype in auto.IMAGE_FILETYPES or filetype in (FileType.WAV, FileType.EMPTY):
 								        pytest.skip()
 								    extension = filetype.name.lower()
 								    filetype_module = FILETYPE_TO_MODULE.get(filetype, extension)
 								    fun_name = "partition_" + filetype_module
 								    module = import_module(f"unstructured.partition.{filetype_module}")
 								    fun = getattr(module, fun_name)
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    for file in pathlib.Path(example_doc_path("")).iterdir():
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        if file.is_file() and file.suffix == f".{extension}":
 								            elements = fun(str(file))
 								            assert all(
 								                el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype]
 								                for el in elements
 								                if el.metadata.filetype is not None
 								            )
 								            break
 								# -- .languages -----------------------------------------------------------
 								def test_auto_partition_element_metadata_user_provided_languages():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(
 								        example_doc_path("chevron-page.pdf"),
 								        strategy=PartitionStrategy.OCR_ONLY,
 								        languages=["eng"],
 								    )
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements[0].metadata.languages == ["eng"]
 								def test_partition_languages_incorrectly_defaults_to_English(tmp_path: pathlib.Path):
 								    """A short German sentence partitioned with default arguments is labeled ["eng"].

 								    This pins the current (acknowledged-incorrect) behavior so a change to it is noticed.
 								    """
 								    # -- We don't totally rely on langdetect for short text, so text like the following that is
 								    # -- in German will be labeled as English.
 								    german = "Ein kurzer Satz."
 								    # -- write the sentence to a temporary .txt file so it is partitioned from a file path --
 								    filepath = str(tmp_path / "short-german.txt")
 								    with open(filepath, "w") as f:
 								        f.write(german)
 								    elements = partition(filepath)
 								    assert elements[0].metadata.languages == ["eng"]
 								def test_partition_languages_default_to_None():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(example_doc_path("handbook-1p.docx"), detect_language_per_element=True)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    # PageBreak and other elements with no text will have `None` for `languages`
 								    none_langs = [element for element in elements if element.metadata.languages is None]
 								    assert none_langs[0].text == ""
-												detect document language across all partitioners (#1627)

### Summary
Closes #1534 and #1535
Detects document language using `langdetect` package. 
Creates new kwargs for user to set the document language (`languages`)
or detect the language at the element level instead of the default
document level (`detect_language_per_element`)

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
											
										
										
											2023-10-10 20:47:56 -05:00
 								def test_partition_default_does_not_overwrite_other_defaults():
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    """`partition()` ["eng"] default does not overwrite ["auto"] default in other partitioners."""
-												detect document language across all partitioners (#1627)

### Summary
Closes #1534 and #1535
Detects document language using `langdetect` package. 
Creates new kwargs for user to set the document language (`languages`)
or detect the language at the element level instead of the default
document level (`detect_language_per_element`)

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
											
										
										
											2023-10-10 20:47:56 -05:00
+								    # the default for `languages` is ["auto"] in partition_text
 								    from unstructured.partition.text import partition_text
 								    # Use a document that is primarily in a language other than English
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("language-docs/UDHR_first_article_all.txt")
 								    text_elements = partition_text(file_path)
-												detect document language across all partitioners (#1627)

### Summary
Closes #1534 and #1535
Detects document language using `langdetect` package. 
Creates new kwargs for user to set the document language (`languages`)
or detect the language at the element level instead of the default
document level (`detect_language_per_element`)

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
											
										
										
											2023-10-10 20:47:56 -05:00
+								    assert text_elements[0].metadata.languages != ["eng"]
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    auto_elements = partition(file_path)
-												detect document language across all partitioners (#1627)

### Summary
Closes #1534 and #1535
Detects document language using `langdetect` package. 
Creates new kwargs for user to set the document language (`languages`)
or detect the language at the element level instead of the default
document level (`detect_language_per_element`)

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
											
										
										
											2023-10-10 20:47:56 -05:00
+								    assert auto_elements[0].metadata.languages != ["eng"]
 								    assert auto_elements[0].metadata.languages == text_elements[0].metadata.languages
-												fix: default to None for the languages metadata field (#1743)

### Summary
Closes #1714
Changes the default value for `languages` to `None` for elements that
don't have text or the language can't be detected.

### Testing
```
from unstructured.partition.auto import partition
filename = "example-docs/handbook-1p.docx"
elements = partition(filename=filename, detect_language_per_element=True)

# PageBreak elements don't have text and will be collected here
none_langs = [element for element in elements if element.metadata.languages is None]
none_langs[0].text
```

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-14 17:46:24 -05:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# MISCELLANEOUS BEHAVIORS
 								# ================================================================================================
-												feat: enable request timeout (#2013)

Courtesy @cdpierse.

Adds a test to PR #1529 in accordance with feedback.

Description from original PR:

In Python, the default behaviour of `requests.get` without a `timeout`
being set is to hang indefinitely. We have a production use case where
the desired behaviour would be to raise a timeout error rather than have
the application just hang.

This PR adds a new optional keyword parameter `request_timeout` to
`partition` which is passed to `file_and_type_from_url` in the case
where we are fetching from a URL. This is then passed to `requests.get`

---------

Co-authored-by: Charles Pierse <charlespierse@gmail.com>
											
										
										
											2023-11-07 18:44:58 -06:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_works_on_empty_filename():
 								    # Partitioning the empty example text file by path must yield an empty list.
 								    assert partition(example_doc_path("empty.txt")) == []
-												enhancement: add support for bitmap images (#2414)

### Summary

Adds support for bitmap images (`.bmp`) in both file detection and
partitioning. Bitmap images will be processed with `partition_image`
just like JPGs and PNGs.

### Testing

```python
from unstructured.file_utils.filetype import detect_filetype
from unstructured.partition.auto import partition
from PIL import Image

filename = "example-docs/layout-parser-paper-with-table.jpg"
bmp_filename = "~/tmp/layout-parser-paper-with-table.bmp"

img = Image.open(filename)
img.save(bmp_filename)

detect_filetype(filename=bmp_filename) # Should be FileType.BMP

elements = partition(filename=bmp_filename)
```
											
										
										
											2024-01-17 17:50:36 -05:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_works_on_empty_file():
 								    # Partitioning the empty example text file through an open binary file object
 								    # (passed as `file=`) must yield an empty list.
 								    with open(example_doc_path("empty.txt"), "rb") as f:
 								        assert partition(file=f) == []
-												enhancement: add support for bitmap images (#2414)

### Summary

Adds support for bitmap images (`.bmp`) in both file detection and
partitioning. Bitmap images will be processed with `partition_image`
just like JPGs and PNGs.

### Testing

```python
from unstructured.file_utils.filetype import detect_filetype
from unstructured.partition.auto import partition
from PIL import Image

filename = "example-docs/layout-parser-paper-with-table.jpg"
bmp_filename = "~/tmp/layout-parser-paper-with-table.bmp"

img = Image.open(filename)
img.save(bmp_filename)

detect_filetype(filename=bmp_filename) # Should be FileType.BMP

elements = partition(filename=bmp_filename)
```
											
										
										
											2024-01-17 17:50:36 -05:00
-												enhancement: process `.p7s` files with `partition_email` (#2521)

### Summary

Closes #2489, which reported an inability to process `.p7s` files. PR
implements two changes:

- If the user selected content type for the email is not available and
there is another valid content type available, fall back to the other
valid content type.
- For signed messages, extract the signature and add it to the metadata


### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/eml/signed-doc.p7s"
elements = partition(filename=filename) # should get a message about fall back logic
print(elements[0]) # "This is a test"
elements[0].metadata.to_dict() # Will see the signature
```
											
										
										
											2024-02-07 17:31:49 -05:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_get_partition_with_extras_prompts_for_install_if_missing():
 								    partition_with_extras_map: dict[str, Callable[..., list[Element]]] = {}
 								    with pytest.raises(ImportError) as exception_info:
 								        _get_partition_with_extras("pdf", partition_with_extras_map)
-												enhancement: process `.p7s` files with `partition_email` (#2521)

### Summary

Closes #2489, which reported an inability to process `.p7s` files. PR
implements two changes:

- If the user selected content type for the email is not available and
there is another valid content type available, fall back to the other
valid content type.
- For signed messages, extract the signature and add it to the metadata


### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/eml/signed-doc.p7s"
elements = partition(filename=filename) # should get a message about fall back logic
print(elements[0]) # "This is a test"
elements[0].metadata.to_dict() # Will see the signature
```
											
										
										
											2024-02-07 17:31:49 -05:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    msg = str(exception_info.value)
 								    assert 'Install the pdf dependencies with pip install "unstructured[pdf]"' in msg