unstructured/test_unstructured/partition/test_auto.py

# pyright: reportPrivateUsage=false

from __future__ import annotations

import json
import os
import pathlib
import tempfile
import warnings
from importlib import import_module
from typing import Iterator
from unittest.mock import MagicMock, patch

import pytest
from PIL import Image

from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
from test_unstructured.partition.test_constants import (
    EXPECTED_TABLE,
    EXPECTED_TABLE_XLSX,
    EXPECTED_TEXT,
    EXPECTED_XLS_TABLE,
)
from test_unstructured.unit_utils import (
    ANY,
    FixtureRequest,
    LogCaptureFixture,
    example_doc_path,
    function_mock,
    method_mock,
)
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import (
    Address,
    CompositeElement,
    Element,
    ElementMetadata,
    ListItem,
    NarrativeText,
    Table,
    TableChunk,
    Text,
    Title,
)
from unstructured.file_utils.filetype import detect_filetype
from unstructured.file_utils.model import FileType, create_file_type, register_partitioner
from unstructured.partition.auto import _PartitionerLoader, partition
from unstructured.partition.common import UnsupportedFileFormatError
from unstructured.partition.utils.constants import PartitionStrategy
from unstructured.staging.base import elements_from_json, elements_to_dicts, elements_to_json

# -- True when the `/.dockerenv` marker file exists; used below to skip a test under Docker --
is_in_docker = os.path.exists("/.dockerenv")


# ================================================================================================
# CSV
# ================================================================================================


def test_auto_partition_csv_from_filename():
    """Partition the Stanley Cups CSV by path and check the first element's text and metadata."""
    csv_path = example_doc_path("stanley-cups.csv")

    table = partition(csv_path)[0]

    assert clean_extra_whitespace(table.text) == EXPECTED_TEXT
    metadata = table.metadata
    assert metadata.text_as_html == EXPECTED_TABLE
    assert metadata.filetype == "text/csv"


def test_auto_partition_csv_from_file():
    """Partition the Stanley Cups CSV from an open binary file and check the first element."""
    with open(example_doc_path("stanley-cups.csv"), "rb") as csv_file:
        elements = partition(file=csv_file)

    table = elements[0]
    assert clean_extra_whitespace(table.text) == EXPECTED_TEXT
    assert isinstance(table, Table)
    assert table.metadata.text_as_html == EXPECTED_TABLE
    assert table.metadata.filetype == "text/csv"


# ================================================================================================
# DOC
# ================================================================================================


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
)
def test_auto_partition_doc_from_filename(
    pass_metadata_filename: bool, content_type: str | None, expected_docx_elements: list[Element]
):
    """Partition `simple.doc` by path for each metadata-filename / content-type combination.

    The elements must equal `expected_docx_elements` and each must carry the file's name and
    directory in its metadata.
    """
    doc_path = example_doc_path("simple.doc")

    elements = partition(
        filename=doc_path,
        metadata_filename=doc_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    # -- diagnostic dump of the partitioned elements, one per line --
    for element in elements:
        print(f"{type(element).__name__}({repr(element.text)})")

    assert elements == expected_docx_elements
    docs_dir = example_doc_path("")
    for element in elements:
        assert element.metadata.filename == "simple.doc"
    for element in elements:
        assert element.metadata.file_directory == docs_dir


@pytest.mark.skipif(is_in_docker, reason="Passes in CI but not Docker. Remove skip on #3364 fix.")
def test_auto_partition_doc_from_file(expected_docx_elements: list[Element]):
    """Partition `simple.doc` from an open binary file; skipped when `is_in_docker` is true."""
    doc_path = example_doc_path("simple.doc")
    with open(doc_path, "rb") as doc_file:
        elements = partition(file=doc_file)

    assert elements == expected_docx_elements


# ================================================================================================
# DOCX
# ================================================================================================


def test_auto_partition_docx_from_filename(expected_docx_elements: list[Element]):
    """Partition `simple.docx` by path with the hi-res strategy and check elements and filename."""
    docx_path = example_doc_path("simple.docx")

    elements = partition(docx_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
    for element in elements:
        assert element.metadata.filename == "simple.docx"


def test_auto_partition_docx_from_file(expected_docx_elements: list[Element]):
    """Partition `simple.docx` from an open binary file with the hi-res strategy."""
    docx_path = example_doc_path("simple.docx")

    with open(docx_path, "rb") as docx_file:
        elements = partition(file=docx_file, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements


@pytest.mark.parametrize("file_name", ["simple.docx", "simple.doc", "simple.odt"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_docx_and_its_brokers(
    request: FixtureRequest, file_name: str, strategy: str
):
    """The `strategy` value given to `partition()` is the one `partition_docx()` receives.

    For the brokering partitioners (DOC, ODT) the value has to travel through `partition_doc()`
    or `partition_odt()`, which must pass it on to `partition_docx()`. This test confirms it
    arrives intact at the DOCX partitioner's options.

    3 file-types X 4 strategies = 12 test-cases.
    """
    from unstructured.partition.docx import _DocxPartitioner

    def emit_received_strategy(self: _DocxPartitioner) -> Iterator[Element]:
        # -- surface the strategy the partitioner was configured with as element text --
        yield Text(f"strategy=={self._opts.strategy}")

    iter_document_elements_mock = method_mock(
        request,
        _DocxPartitioner,
        "_iter_document_elements",
        side_effect=emit_received_strategy,
    )
    doc_path = example_doc_path(file_name)

    (element,) = partition(doc_path, strategy=strategy)

    iter_document_elements_mock.assert_called_once_with(ANY)
    assert element.text == f"strategy=={strategy}"


# ================================================================================================
# EML
# ================================================================================================

# -- elements the EML tests in this section expect from partitioning `eml/fake-email.eml` --
EXPECTED_EMAIL_OUTPUT = [
    NarrativeText(text="This is a test email to use for unit tests."),
    Text(text="Important points:"),
    ListItem(text="Roses are red"),
    ListItem(text="Violets are blue"),
]


def test_auto_partition_email_from_filename():
    """Partition the fake EML by path and check elements plus first-element file metadata."""
    eml_path = example_doc_path("eml/fake-email.eml")
    expected_directory, expected_filename = os.path.split(eml_path)

    elements = partition(eml_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
    first_metadata = elements[0].metadata
    assert first_metadata.filename == expected_filename
    assert first_metadata.file_directory == expected_directory


def test_auto_partition_email_from_file():
    """Partition the fake EML from an open binary file and compare with the expected elements."""
    eml_path = example_doc_path("eml/fake-email.eml")

    with open(eml_path, "rb") as eml_file:
        elements = partition(file=eml_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT


# ================================================================================================
# EPUB
# ================================================================================================


def test_auto_partition_epub_from_filename():
    """Partition the winter-sports EPUB by path and check the text of its third element."""
    epub_path = example_doc_path("winter-sports.epub")

    elements = partition(epub_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    third = elements[2]
    assert third.text.startswith("The Project Gutenberg eBook of Winter Sports")


def test_auto_partition_epub_from_file():
    """Partition the winter-sports EPUB from an open file and check its third element's text."""
    epub_path = example_doc_path("winter-sports.epub")

    with open(epub_path, "rb") as epub_file:
        elements = partition(file=epub_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    third = elements[2]
    assert third.text.startswith("The Project Gutenberg eBook of Winter Sports")


# ================================================================================================
# HTML
# ================================================================================================


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_filename(pass_metadata_filename: bool, content_type: str | None):
    """Partition the 10-K HTML by path; every element must carry the file's name and directory."""
    html_path = example_doc_path("example-10k-1p.html")

    elements = partition(
        filename=html_path,
        metadata_filename=html_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    assert elements
    expected_directory, expected_filename = os.path.split(html_path)
    for element in elements:
        assert element.metadata.filename == expected_filename
    for element in elements:
        assert element.metadata.file_directory == expected_directory


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_file(pass_metadata_filename: bool, content_type: str | None):
    """Partition the 10-K HTML from an open binary file; at least one element must result."""
    html_path = example_doc_path("example-10k-1p.html")
    partition_kwargs = {
        "metadata_filename": html_path if pass_metadata_filename else None,
        "content_type": content_type,
        "strategy": PartitionStrategy.HI_RES,
    }

    with open(html_path, "rb") as html_file:
        elements = partition(file=html_file, **partition_kwargs)

    assert len(elements) > 0


def test_auto_partition_html_pre_from_file():
    """Partition `fake-html-pre.htm` by path and check categories, first element and metadata."""
    elements = partition(example_doc_path("fake-html-pre.htm"))

    assert len(elements) > 0
    categories = [element.category for element in elements]
    assert "PageBreak" not in categories
    first = elements[0]
    assert clean_extra_whitespace(first.text).startswith("[107th Congress Public Law 56]")
    assert isinstance(first, NarrativeText)
    for element in elements:
        assert element.metadata.filetype == "text/html"
    for element in elements:
        assert element.metadata.filename == "fake-html-pre.htm"


# ================================================================================================
# IMAGE
# ================================================================================================


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpeg_from_filename(pass_metadata_filename: bool, content_type: str | None):
    """Partition the layout-parser JPEG by path; the third element is the paper's title line."""
    jpg_path = example_doc_path("img/layout-parser-paper-fast.jpg")
    expected_text = (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )

    elements = partition(
        filename=jpg_path,
        metadata_filename=jpg_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.AUTO,
    )

    third = elements[2]
    assert third.text == expected_text
    assert third.metadata.coordinates is not None


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpeg_from_file(pass_metadata_filename: bool, content_type: str | None):
    """Partition the layout-parser JPEG from an open file; the third element is the title line."""
    jpg_path = example_doc_path("img/layout-parser-paper-fast.jpg")
    expected_text = (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )

    with open(jpg_path, "rb") as jpg_file:
        elements = partition(
            file=jpg_file,
            metadata_filename=jpg_path if pass_metadata_filename else None,
            content_type=content_type,
            strategy=PartitionStrategy.AUTO,
        )

    third = elements[2]
    assert third.text == expected_text
    assert third.metadata.coordinates is not None


def test_auto_partition_bmp_from_filename(tmp_path: pathlib.Path):
    """Re-save a table-bearing JPEG as BMP, partition it hi-res, and expect one HTML table."""
    bmp_path = str(tmp_path / "example.bmp")
    jpg_path = example_doc_path("img/layout-parser-paper-with-table.jpg")
    # -- the output format is taken from the `.bmp` extension of the save path --
    with Image.open(jpg_path) as source_image:
        source_image.save(bmp_path)

    elements = partition(filename=bmp_path, strategy=PartitionStrategy.HI_RES)

    html_tables = [
        element.metadata.text_as_html for element in elements if element.metadata.text_as_html
    ]
    assert len(html_tables) == 1
    table_html = html_tables[0]
    assert "<table><thead><tr>" in table_html
    assert "</thead><tbody><tr>" in table_html


@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_image_element_extraction(extract_image_block_to_payload: bool):
    """Partition a JPEG requesting Image and Table block extraction, then verify the result.

    Verification is delegated to `assert_element_extraction()`, which is given the elements, the
    requested block types, the payload flag and the temporary output directory.
    """
    block_types = ["Image", "Table"]
    image_path = example_doc_path("img/embedded-images-tables.jpg")

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            filename=image_path,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=output_dir,
        )

        assert_element_extraction(
            elements, block_types, extract_image_block_to_payload, output_dir
        )


# ================================================================================================
# JSON
# ================================================================================================


# TODO(scanny): This test should go away when we fix #3365. This test glosses over several
# important JSON "rehydration" behaviors, in particular that the metadata should match exactly.
# The following test `test_auto_partition_json_from_file_preserves_original_elements` will be the
# replacement for this test.
def test_auto_partitioned_json_output_maintains_consistency_with_fixture_elements():
    """Re-partitioning an unstructured JSON output file by filename reproduces its elements.

    The fixture `spring-weather.html.json` is loaded as the expected result, then partitioned and
    round-tripped through `elements_to_json()`. The `"metadata"` entry is removed from every
    element dict on both sides before comparing, so metadata is NOT covered by this test.
    """
    json_file_path = example_doc_path("spring-weather.html.json")
    original_file_name = "spring-weather.html"
    # -- JSON text is UTF-8; read it explicitly rather than with the platform's locale encoding --
    with open(json_file_path, encoding="utf-8") as json_f:
        expected_result = json.load(json_f)

    partitioning_result = json.loads(
        elements_to_json(
            partition(
                filename=str(json_file_path),
                # -- use the original file name to get the same element IDs (hashes) --
                metadata_filename=original_file_name,
                strategy=PartitionStrategy.HI_RES,
            )
        )
    )
    # -- drop metadata from both sides; a missing "metadata" key raises KeyError here --
    for elem in partitioning_result:
        elem.pop("metadata")
    for elem in expected_result:
        elem.pop("metadata")

    assert expected_result == partitioning_result


@pytest.mark.xfail(
    reason=(
        "https://github.com/Unstructured-IO/unstructured/issues/3365"
        " partition_json() does not preserve original element-id or metadata"
    ),
    raises=AssertionError,
    strict=True,
)
def test_auto_partition_json_from_file_preserves_original_elements():
    """Partitioning `simple.json` should give back exactly the elements the file describes.

    Marked as a strict xfail: the final assertion is expected to raise `AssertionError` until
    the linked issue is resolved.
    """
    json_path = example_doc_path("simple.json")
    expected_dicts = elements_to_dicts(elements_from_json(json_path))

    with open(json_path, "rb") as json_file:
        partitioned_elements = partition(file=json_file)

    assert elements_to_dicts(partitioned_elements) == expected_dicts


def test_auto_partition_processes_simple_ndjson(tmp_path: pathlib.Path):
    """A file holding one bare JSON element object partitions into one `NarrativeText`."""
    json_path = tmp_path / "unprocessable.json"
    # -- a single element object, not wrapped in a JSON array --
    json_path.write_text('{"text": "hello", "type": "NarrativeText"}')

    elements = partition(filename=str(json_path))

    assert len(elements) == 1
    only_element = elements[0]
    assert isinstance(only_element, NarrativeText)
    assert "hello" in only_element.text


# ================================================================================================
# MD
# ================================================================================================


def test_partition_md_from_url_works_with_embedded_html():
    """Partition the project README from its raw GitHub URL as Markdown.

    NOTE(review): presumably needs network access since only a `url` is supplied -- confirm.
    """
    readme_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"

    elements = partition(
        url=readme_url, content_type="text/markdown", strategy=PartitionStrategy.HI_RES
    )

    second = elements[1]
    assert "unstructured" in second.text


# ================================================================================================
# MSG
# ================================================================================================


def test_auto_partition_msg_from_filename():
    """Partition the fake MSG email by path and compare against the expected four elements."""
    msg_path = example_doc_path("fake-email.msg")

    elements = partition(msg_path, strategy=PartitionStrategy.HI_RES)

    expected_elements = [
        NarrativeText(text="This is a test email to use for unit tests."),
        Text(text="Important points:"),
        ListItem(text="Roses are red"),
        ListItem(text="Violets are blue"),
    ]
    assert elements == expected_elements


# ================================================================================================
# ODT
# ================================================================================================


def test_auto_partition_odt_from_filename(expected_docx_elements: list[Element]):
    """Partition `simple.odt` by path; the result must equal the shared DOCX expectations."""
    odt_path = example_doc_path("simple.odt")

    elements = partition(odt_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements


def test_auto_partition_odt_from_file(expected_docx_elements: list[Element]):
    """Partition `simple.odt` from an open binary file; result equals the DOCX expectations."""
    odt_path = example_doc_path("simple.odt")

    with open(odt_path, "rb") as odt_file:
        elements = partition(file=odt_file, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements


# ================================================================================================
# ORG
# ================================================================================================


def test_auto_partition_org_from_filename():
    """Partition `README.org` by path; the first element is the title with `text/org` type."""
    org_path = example_doc_path("README.org")

    first = partition(org_path)[0]

    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/org"


def test_auto_partition_org_from_file():
    """Partition `README.org` from an open file, passing `text/org` as the content type."""
    org_path = example_doc_path("README.org")

    with open(org_path, "rb") as org_file:
        elements = partition(file=org_file, content_type="text/org")

    first = elements[0]
    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/org"


# ================================================================================================
# PDF
# ================================================================================================


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_type: str | None):
    """Partition the Chevron page PDF by path (hi-res) and check its first two elements."""
    pdf_path = example_doc_path("pdf/chevron-page.pdf")

    elements = partition(
        filename=pdf_path,
        metadata_filename=pdf_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    title = elements[0]
    assert isinstance(title, Title)
    assert title.text.startswith("eastern mediterranean")
    assert title.metadata.filename == os.path.basename(pdf_path)
    assert title.metadata.file_directory == os.path.split(pdf_path)[0]

    narrative = elements[1]
    assert isinstance(narrative, NarrativeText)
    assert narrative.text.startswith("We’re investing")


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type: str | None):
    """Partition the Chevron page PDF from an open file (hi-res) and check two elements."""
    pdf_path = example_doc_path("pdf/chevron-page.pdf")

    with open(pdf_path, "rb") as pdf_file:
        elements = partition(
            file=pdf_file,
            metadata_filename=pdf_path if pass_metadata_filename else None,
            content_type=content_type,
            strategy=PartitionStrategy.HI_RES,
        )

    title = elements[0]
    assert isinstance(title, Title)
    assert title.text.startswith("eastern mediterranean")

    narrative = elements[1]
    assert isinstance(narrative, NarrativeText)
    assert narrative.text.startswith("We’re investing")


def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
    """`partition()` looks up the PDF partitioner and calls it with the FAST strategy.

    Both `partition_pdf()` and `_PartitionerLoader.get()` are mocked, so this checks only the
    loader lookup and the exact keyword arguments forwarded to the partitioner.
    """
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    pdf_partitioner_mock = function_mock(
        request,
        "unstructured.partition.pdf.partition_pdf",
        return_value=[NarrativeText("Hello there!")],
    )
    loader_get_mock = method_mock(
        request, _PartitionerLoader, "get", return_value=pdf_partitioner_mock
    )
    expected_kwargs = {
        "filename": pdf_path,
        "file": None,
        "url": None,
        "strategy": PartitionStrategy.FAST,
        "languages": None,
        "metadata_filename": None,
        "infer_table_structure": False,
        "extract_images_in_pdf": False,
        "extract_image_block_types": None,
        "extract_image_block_output_dir": None,
        "extract_image_block_to_payload": False,
        "hi_res_model_name": None,
        "starting_page_number": 1,
    }

    partition(pdf_path, strategy=PartitionStrategy.FAST)

    loader_get_mock.assert_called_once_with(ANY, FileType.PDF)
    pdf_partitioner_mock.assert_called_once_with(**expected_kwargs)


@pytest.mark.parametrize("infer_bool", [True, False])
def test_auto_handles_kwarg_with_infer_table_structure(infer_bool: bool):
    """The `infer_table_structure` value given to `partition()` reaches the OCR step unchanged.

    `pdf_infer_table_structure=True` is passed as well; the assertion shows the explicit
    `infer_table_structure` value is the one `process_file_with_ocr()` receives.
    """
    with patch(
        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
    ) as process_file_with_ocr_:
        partition(
            example_doc_path("pdf/layout-parser-paper-fast.pdf"),
            pdf_infer_table_structure=True,
            strategy=PartitionStrategy.HI_RES,
            infer_table_structure=infer_bool,
        )
        # -- `call_args[1]` is the kwargs dict of the most recent call to the mock --
        assert process_file_with_ocr_.call_args[1]["infer_table_structure"] is infer_bool


def test_auto_handles_kwarg_with_infer_table_structure_when_none():
    """With `infer_table_structure=None` and `pdf_infer_table_structure=True`, OCR gets `True`."""
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    ocr_target = "unstructured.partition.pdf_image.ocr.process_file_with_ocr"

    with patch(ocr_target) as process_file_with_ocr_:
        partition(
            pdf_path,
            pdf_infer_table_structure=True,
            strategy=PartitionStrategy.HI_RES,
            infer_table_structure=None,
        )
        # -- `call_args[1]` is the kwargs dict of the most recent call to the mock --
        forwarded_kwargs = process_file_with_ocr_.call_args[1]
        assert forwarded_kwargs["infer_table_structure"] is True


def test_auto_partition_pdf_uses_pdf_infer_table_structure_argument():
    """`pdf_infer_table_structure=True` alone results in a truthy value reaching the OCR step."""
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    ocr_target = "unstructured.partition.pdf_image.ocr.process_file_with_ocr"

    with patch(ocr_target) as process_file_with_ocr_:
        partition(
            pdf_path,
            pdf_infer_table_structure=True,
            strategy=PartitionStrategy.HI_RES,
        )
        # -- `call_args[1]` is the kwargs dict of the most recent call to the mock --
        forwarded_kwargs = process_file_with_ocr_.call_args[1]
        assert forwarded_kwargs["infer_table_structure"]


@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_pdf_element_extraction(extract_image_block_to_payload: bool):
    """Partition a PDF requesting Image and Table block extraction, then verify the result.

    Verification is delegated to `assert_element_extraction()`, which is given the elements, the
    requested block types, the payload flag and the temporary output directory.
    """
    block_types = ["Image", "Table"]
    pdf_path = example_doc_path("pdf/embedded-images-tables.pdf")

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            pdf_path,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=output_dir,
        )

        assert_element_extraction(
            elements, block_types, extract_image_block_to_payload, output_dir
        )


def test_auto_partition_html_element_extraction():
    """Partition an HTML file with a base64 image, extracting Image blocks to the payload.

    The temporary directory is handed only to `assert_element_extraction()`; it is not passed to
    `partition()` as an output directory.
    """
    block_types = ["Image"]
    html_path = example_doc_path("fake-html-with-base64-image.html")

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            html_path,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=True,
        )

        assert_element_extraction(elements, block_types, True, output_dir)


def test_auto_partition_html_image_with_url():
    """The second element of the image-from-URL HTML fixture carries an `image_url`."""
    html_path = example_doc_path("fake-html-with-image-from-url.html")

    elements = partition(html_path)

    second = elements[1]
    assert second.metadata.image_url is not None


def test_partition_pdf_does_not_raise_warning():
    """Hi-res partitioning of the layout-parser PDF completes without emitting any warning."""
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    # NOTE(robinson): Turning warnings into errors inside `catch_warnings()` is the approach the
    # pytest docs recommend for asserting that no warning is emitted.
    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
    #      #additional-use-cases-of-warnings-in-tests
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        partition(pdf_path, strategy=PartitionStrategy.HI_RES)


# ================================================================================================
# PPT
# ================================================================================================


def test_auto_partition_ppt_from_filename():
    """Partition the fake PPT deck by path and check its elements and file metadata."""
    ppt_path = example_doc_path("fake-power-point.ppt")
    expected_elements = [
        Title(text="Adding a Bullet Slide"),
        ListItem(text="Find the bullet slide layout"),
        ListItem(text="Use _TextFrame.text for first bullet"),
        ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
        NarrativeText(text="Here is a lot of text!"),
        NarrativeText(text="Here is some text in a text box!"),
    ]

    elements = partition(ppt_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_elements
    for element in elements:
        assert element.metadata.filename == "fake-power-point.ppt"
    docs_dir = example_doc_path("")
    for element in elements:
        assert element.metadata.file_directory == docs_dir


# ================================================================================================
# PPTX
# ================================================================================================


def test_auto_partition_pptx_from_filename():
    """Partition the fake PPTX deck by path and check its elements and file metadata."""
    pptx_path = example_doc_path("fake-power-point.pptx")
    expected_elements = [
        Title(text="Adding a Bullet Slide"),
        ListItem(text="Find the bullet slide layout"),
        ListItem(text="Use _TextFrame.text for first bullet"),
        ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
        NarrativeText(text="Here is a lot of text!"),
        NarrativeText(text="Here is some text in a text box!"),
    ]

    elements = partition(pptx_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_elements
    for element in elements:
        assert element.metadata.filename == "fake-power-point.pptx"
    docs_dir = example_doc_path("")
    for element in elements:
        assert element.metadata.file_directory == docs_dir


@pytest.mark.parametrize("file_name", ["simple.pptx", "fake-power-point.ppt"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_pptx_and_its_brokers(
    request: FixtureRequest, file_name: str, strategy: str
):
    """The `strategy` value given to `partition()` is the one `partition_pptx()` receives.

    For the brokering partitioner (PPT) the value has to travel through `partition_ppt()`, which
    must pass it on to `partition_pptx()`. This test confirms it arrives intact at the PPTX
    partitioner's options.

    2 file-types X 4 strategies = 8 test-cases.
    """
    from unstructured.partition.pptx import _PptxPartitioner

    def emit_received_strategy(self: _PptxPartitioner) -> Iterator[Element]:
        # -- surface the strategy the partitioner was configured with as element text --
        yield Text(f"strategy=={self._opts.strategy}")

    iter_presentation_elements_mock = method_mock(
        request,
        _PptxPartitioner,
        "_iter_presentation_elements",
        side_effect=emit_received_strategy,
    )
    deck_path = example_doc_path(file_name)

    (element,) = partition(deck_path, strategy=strategy)

    iter_presentation_elements_mock.assert_called_once_with(ANY)
    assert element.text == f"strategy=={strategy}"


# ================================================================================================
# RST
# ================================================================================================


def test_auto_partition_rst_from_filename():
    """Partition `README.rst` by path; the first element is the title with `text/x-rst` type."""
    rst_path = example_doc_path("README.rst")

    first = partition(rst_path)[0]

    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/x-rst"


def test_auto_partition_rst_from_file():
    """Partition `README.rst` from an open file, passing `text/x-rst` as the content type."""
    rst_path = example_doc_path("README.rst")

    with open(rst_path, "rb") as rst_file:
        elements = partition(file=rst_file, content_type="text/x-rst")

    first = elements[0]
    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/x-rst"


# ================================================================================================
# RTF
# ================================================================================================


def test_auto_partition_rtf_from_filename():
    """Partition `fake-doc.rtf` by path (hi-res); the first element is the expected title."""
    rtf_path = example_doc_path("fake-doc.rtf")

    elements = partition(rtf_path, strategy=PartitionStrategy.HI_RES)

    first = elements[0]
    assert first == Title("My First Heading")


# ================================================================================================
# TSV
# ================================================================================================


def test_auto_partition_tsv_from_filename():
    """Partition the Stanley Cups TSV by path and check the first element's text and metadata."""
    tsv_path = example_doc_path("stanley-cups.tsv")

    table = partition(tsv_path)[0]

    assert clean_extra_whitespace(table.text) == EXPECTED_TEXT
    metadata = table.metadata
    assert metadata.text_as_html == EXPECTED_TABLE
    assert metadata.filetype == "text/tsv"


# ================================================================================================
# TXT
# ================================================================================================
@pytest.mark.parametrize(
    ("filename", "expected_elements"),
    [
        (
            "fake-text.txt",
            [
                NarrativeText(text="This is a test document to use for unit tests."),
                Address(text="Doylestown, PA 18901"),
                Title(text="Important points:"),
                ListItem(text="Hamburgers are delicious"),
                ListItem(text="Dogs are the best"),
                ListItem(text="I love fuzzy blankets"),
            ],
        ),
        ("fake-text-all-whitespace.txt", []),
    ],
)
def test_auto_partition_text_from_filename(filename: str, expected_elements: list[Element]):
    """Partition a text file by path and check its elements and their file metadata.

    Covers a file with content and an all-whitespace file that is expected to yield no elements.
    """
    text_path = example_doc_path(filename)

    elements = partition(filename=text_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_elements
    for element in elements:
        assert element.metadata.filename == filename
    docs_dir = example_doc_path("")
    for element in elements:
        assert element.metadata.file_directory == docs_dir


def test_auto_partition_text_from_file():
    """Partition `fake-text.txt` from an open binary file and compare with expected elements."""
    expected_elements = [
        NarrativeText(text="This is a test document to use for unit tests."),
        Address(text="Doylestown, PA 18901"),
        Title(text="Important points:"),
        ListItem(text="Hamburgers are delicious"),
        ListItem(text="Dogs are the best"),
        ListItem(text="I love fuzzy blankets"),
    ]
    text_path = example_doc_path("fake-text.txt")

    with open(text_path, "rb") as text_file:
        elements = partition(file=text_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == expected_elements


# ================================================================================================
# XLS
# ================================================================================================


def test_auto_partition_xls_from_filename():
    """Partition `tests-example.xls` by path and check element counts and the first table."""
    xls_path = example_doc_path("tests-example.xls")

    elements = partition(xls_path, include_header=False, skip_infer_table_types=[])

    assert len(elements) == 14
    tables = [element for element in elements if isinstance(element, Table)]
    assert len(tables) == 2
    first = elements[0]
    assert first.metadata.text_as_html == EXPECTED_XLS_TABLE
    assert len(first.text) == 507


# ================================================================================================
# XLSX
# ================================================================================================


def test_auto_partition_xlsx_from_filename():
    """Partition the Stanley Cups XLSX by path and check counts, text, pages and filetype."""
    xlsx_mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    xlsx_path = example_doc_path("stanley-cups.xlsx")

    elements = partition(xlsx_path, include_header=False, skip_infer_table_types=[])

    assert len(elements) == 4
    assert len([element for element in elements if isinstance(element, Table)]) == 2
    assert len([element for element in elements if isinstance(element, Title)]) == 2
    title, table = elements[0], elements[1]
    assert clean_extra_whitespace(title.text) == "Stanley Cups"
    assert clean_extra_whitespace(table.text) == (
        "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
    )
    assert table.metadata.text_as_html == EXPECTED_TABLE_XLSX
    # -- the first two elements are expected on page 1, everything after on page 2 --
    leading_pair, remainder = elements[:2], elements[2:]
    for element in leading_pair:
        assert element.metadata.page_number == 1
    for element in remainder:
        assert element.metadata.page_number == 2
    for element in elements:
        assert element.metadata.filetype == xlsx_mime_type


def test_auto_partition_xlsx_from_file():
    """An `.xlsx` file-object yields the same elements and metadata as the path form."""
    with open(example_doc_path("stanley-cups.xlsx"), "rb") as xlsx_file:
        elements = partition(file=xlsx_file, include_header=False, skip_infer_table_types=[])

    table_count = sum(1 for e in elements if isinstance(e, Table))
    title_count = sum(1 for e in elements if isinstance(e, Title))
    assert len(elements) == 4
    assert table_count == 2
    assert title_count == 2

    first_title_text = clean_extra_whitespace(elements[0].text)
    first_table_text = clean_extra_whitespace(elements[1].text)
    assert first_title_text == "Stanley Cups"
    assert first_table_text == (
        "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
    )
    assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX

    for e in elements[:2]:
        assert e.metadata.page_number == 1
    for e in elements[2:]:
        assert e.metadata.page_number == 2
    for e in elements:
        assert e.metadata.filetype == (
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )


def test_auto_partition_xlsx_respects_starting_page_number_argument():
    """Page numbering begins at the `starting_page_number` passed to `partition()`."""
    elements = partition(example_doc_path("stanley-cups.xlsx"), starting_page_number=3)

    leading, trailing = elements[:2], elements[2:]
    for e in leading:
        assert e.metadata.page_number == 3
    for e in trailing:
        assert e.metadata.page_number == 4


# ================================================================================================
# XML
# ================================================================================================


def test_auto_partition_xml_from_filename():
    """With `xml_keep_tags=False`, an XML path produces plain text and filename metadata."""
    xml_path = example_doc_path("factbook.xml")

    elements = partition(xml_path, xml_keep_tags=False)

    assert elements[0].text == "United States"
    for e in elements:
        assert e.metadata.filename == "factbook.xml"


def test_auto_partition_xml_from_file():
    """With `xml_keep_tags=False`, an XML file-object produces plain text."""
    with open(example_doc_path("factbook.xml"), "rb") as xml_file:
        elements = partition(file=xml_file, xml_keep_tags=False)

    first_element = elements[0]
    assert first_element.text == "United States"


def test_auto_partition_xml_from_filename_with_tags():
    """With `xml_keep_tags=True`, the XML markup is retained in the element text."""
    xml_path = example_doc_path("factbook.xml")

    elements = partition(xml_path, xml_keep_tags=True)

    first_element = elements[0]
    assert "<leader>Joe Biden</leader>" in first_element.text
    assert first_element.metadata.filename == "factbook.xml"


def test_auto_partition_xml_from_file_with_tags():
    """With `xml_keep_tags=True`, a file-object source also retains the XML markup."""
    with open(example_doc_path("factbook.xml"), "rb") as xml_file:
        elements = partition(file=xml_file, xml_keep_tags=True)

    first_text = elements[0].text
    assert "<leader>Joe Biden</leader>" in first_text


# ================================================================================================
# FILE_TYPE NOT RECOGNIZED OR NOT SUPPORTED
# ================================================================================================


def test_auto_partition_raises_with_bad_type(request: FixtureRequest):
    """`partition()` raises `UnsupportedFileFormatError` when detection reports `UNK`."""
    # -- make file-type detection report "unknown" for whatever it is asked about --
    detect_filetype_ = function_mock(
        request, "unstructured.partition.auto.detect_filetype", return_value=FileType.UNK
    )
    expected_message = "Partitioning is not supported for the FileType.UNK file type."

    with pytest.raises(UnsupportedFileFormatError, match=expected_message):
        partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)

    # -- detection was asked exactly once, with only the file-path populated --
    expected_detect_kwargs = {
        "file_path": "made-up.fake",
        "file": None,
        "encoding": None,
        "content_type": None,
        "metadata_file_path": None,
    }
    detect_filetype_.assert_called_once_with(**expected_detect_kwargs)


# ================================================================================================
# LOAD FROM URL
# ================================================================================================


def test_auto_partition_from_url():
    """A document fetched by URL is partitioned and each element records that URL."""
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"

    elements = partition(
        url=license_url, content_type="text/plain", strategy=PartitionStrategy.HI_RES
    )

    assert elements[0] == Title("Apache License")
    for e in elements:
        assert e.metadata.url == license_url


def test_auto_partition_from_url_with_rfc9110_content_type():
    """A content-type carrying a `; charset=...` parameter is accepted for a URL source."""
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
    content_type = "text/plain; charset=utf-8"

    elements = partition(
        url=license_url, content_type=content_type, strategy=PartitionStrategy.HI_RES
    )

    assert elements[0] == Title("Apache License")
    for e in elements:
        assert e.metadata.url == license_url


def test_auto_partition_from_url_without_providing_content_type():
    """A URL source is partitioned even when the caller supplies no content-type."""
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"

    elements = partition(url=license_url, strategy=PartitionStrategy.HI_RES)

    first_element = elements[0]
    assert first_element == Title("Apache License")
    for e in elements:
        assert e.metadata.url == license_url


def test_auto_partition_warns_if_header_set_and_not_url(caplog: LogCaptureFixture):
    """Passing `headers` without `url` logs a warning as the first log record."""
    request_headers = {"Accept": "application/pdf"}

    partition(
        example_doc_path("eml/fake-email.eml"),
        headers=request_headers,
        strategy=PartitionStrategy.HI_RES,
    )

    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"
    assert "headers kwarg is set but the url kwarg is not. The headers kwarg will b" in caplog.text


def test_auto_partition_from_url_routes_timeout_to_HTTP_request(request: FixtureRequest):
    """`request_timeout` given to `partition()` is forwarded to `file_and_type_from_url()`."""
    # -- the mocked fetch raises, so no network access occurs and partitioning stops there --
    file_and_type_from_url_ = function_mock(
        request,
        "unstructured.partition.auto.file_and_type_from_url",
        side_effect=ConnectionError("Trouble on the wire ..."),
    )
    target_url = "http://eie.io"

    with pytest.raises(ConnectionError, match="Trouble on the wire ..."):
        partition(url=target_url, request_timeout=326)

    file_and_type_from_url_.assert_called_once_with(
        url=target_url,
        content_type=None,
        headers={},
        ssl_verify=True,
        request_timeout=326,
    )


# ================================================================================================
# OTHER ARGS
# ================================================================================================

# -- chunking_strategy ----------------------------------------------------


def test_auto_partition_forwards_chunking_strategy_via_kwargs():
    """With `chunking_strategy` set, `partition()` returns only chunk element types."""
    chunk_types = (CompositeElement, Table, TableChunk)

    chunks = partition(example_doc_path("example-10k-1p.html"), chunking_strategy="by_title")

    for chunk in chunks:
        assert isinstance(chunk, chunk_types)


def test_auto_partition_forwards_max_characters_via_kwargs():
    """No chunk exceeds the `max_characters` limit passed to `partition()`."""
    html_path = example_doc_path("example-10k-1p.html")

    chunks = partition(html_path, chunking_strategy="by_title", max_characters=250)

    chunk_lengths = [len(chunk.text) for chunk in chunks]
    assert all(length <= 250 for length in chunk_lengths)


# -- detect_language_per_element ------------------------------------------


def test_auto_partition_respects_detect_language_per_element_arg():
    """With `detect_language_per_element=True`, each element gets its own languages."""
    mixed_language_path = example_doc_path("language-docs/eng_spa_mult.txt")

    elements = partition(mixed_language_path, detect_language_per_element=True)

    detected = [e.metadata.languages for e in elements]
    assert detected == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]


# -- languages ------------------------------------------------------------


@pytest.mark.parametrize(
    "file_extension",
    [
        "doc",
        "docx",
        "eml",
        "epub",
        "html",
        "md",
        "odt",
        "org",
        "ppt",
        "pptx",
        "rst",
        "rtf",
        "txt",
        "xml",
    ],
)
def test_auto_partition_respects_language_arg(file_extension: str):
    """An explicit `languages` argument is applied to every element, for each file format."""
    doc_path = example_doc_path(f"language-docs/eng_spa_mult.{file_extension}")

    elements = partition(doc_path, languages=["deu"])

    for element in elements:
        assert element.metadata.languages == ["deu"]


# -- include_page_breaks --------------------------------------------------


def test_auto_partition_forwards_include_page_breaks_to_partition_pdf():
    """With `include_page_breaks=True`, a PDF produces at least one "PageBreak" element."""
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    elements = partition(pdf_path, include_page_breaks=True, strategy=PartitionStrategy.HI_RES)

    assert any(element.category == "PageBreak" for element in elements)


# -- metadata_filename ----------------------------------------------------


def test_auto_partition_forwards_metadata_filename_via_kwargs():
    """`metadata_filename` becomes the filename recorded on every element."""
    reported_name = "much-more-interesting-name.txt"

    with open(example_doc_path("fake-text.txt"), "rb") as text_file:
        elements = partition(file=text_file, metadata_filename=reported_name)

    for e in elements:
        assert e.metadata.filename == reported_name


# -- ocr_languages --------------------------------------------------------


def test_auto_partition_image_formats_languages_for_tesseract(request: FixtureRequest):
    """The `languages` argument reaches the OCR call as a "+"-joined `ocr_languages` string."""
    process_file_with_ocr_ = function_mock(
        request, "unstructured.partition.pdf_image.ocr.process_file_with_ocr"
    )
    image_path = example_doc_path("img/chi_sim_image.jpeg")

    partition(image_path, strategy=PartitionStrategy.HI_RES, languages=["zh"])

    # -- inspect the keyword arguments of the first call made to the mocked OCR function --
    first_call = process_file_with_ocr_.call_args_list[0]
    assert first_call.kwargs["ocr_languages"] == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"


@pytest.mark.parametrize(("languages", "ocr_languages"), [(["auto"], ""), (["eng"], "")])
def test_auto_partition_ignores_empty_string_for_ocr_languages(
    languages: list[str], ocr_languages: str
):
    """An empty-string `ocr_languages` still results in `["eng"]` on every element."""
    text_path = example_doc_path("book-war-and-peace-1p.txt")

    elements = partition(
        text_path,
        strategy=PartitionStrategy.OCR_ONLY,
        ocr_languages=ocr_languages,
        languages=languages,
    )

    for e in elements:
        assert e.metadata.languages == ["eng"]


def test_auto_partition_warns_with_ocr_languages(caplog: LogCaptureFixture):
    """Using the `ocr_languages` argument logs a deprecation warning as the first record."""
    pdf_path = example_doc_path("pdf/chevron-page.pdf")

    partition(pdf_path, strategy=PartitionStrategy.HI_RES, ocr_languages="eng")

    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"
    assert "The ocr_languages kwarg will be deprecated" in caplog.text


# -- skip_infer_table_types -----------------------------------------------


@pytest.mark.parametrize(
    ("skip_infer_table_types", "filename", "has_text_as_html"),
    [
        (["xlsx"], "stanley-cups.xlsx", False),
        ([], "stanley-cups.xlsx", True),
        (["odt"], "fake.odt", False),
        ([], "fake.odt", True),
    ],
)
def test_auto_partition_respects_skip_infer_table_types(
    skip_infer_table_types: list[str], filename: str, has_text_as_html: bool
):
    """Tables carry `text_as_html` only when their file-type is not in the skip list."""
    with open(example_doc_path(filename), "rb") as doc_file:
        elements = partition(file=doc_file, skip_infer_table_types=skip_infer_table_types)

    tables = [e for e in elements if isinstance(e, Table)]
    # -- guard against a vacuous pass when the document produces no tables --
    assert tables
    html_present = [table.metadata.text_as_html is not None for table in tables]
    assert all(present == has_text_as_html for present in html_present)


# ================================================================================================
# METADATA BEHAVIORS
# ================================================================================================

# -- .filetype ------------------------------------------------------------


@pytest.mark.parametrize(
    ("content_type", "shortname", "expected_value"),
    [
        ("text/csv", "csv", "text/csv"),
        ("text/html", "html", "text/html"),
        ("jdsfjdfsjkds", "pdf", None),
    ],
)
def test_auto_partition_adds_filetype_to_metadata(
    request: FixtureRequest,
    content_type: str,
    shortname: str,
    expected_value: str | None,
):
    """Elements returned by `partition()` carry the expected `.metadata.filetype`.

    The file-specific partitioner is mocked to return two bare `Text` elements, so the
    `filetype` value observed on the result was not set by that partitioner.
    """
    stub_elements = [Text("text 1"), Text("text 2")]
    partition_fn_ = function_mock(
        request,
        f"unstructured.partition.{shortname}.partition_{shortname}",
        return_value=stub_elements,
    )
    # -- the loader hands back the mocked partitioner for whichever file-type is requested --
    partitioner_loader_get_ = method_mock(
        request, _PartitionerLoader, "get", return_value=partition_fn_
    )
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    elements = partition(pdf_path, content_type=content_type)

    partitioner_loader_get_.assert_called_once()
    assert len(elements) == 2
    for e in elements:
        assert e.metadata.filetype == expected_value


@pytest.mark.parametrize(
    "content_type",
    [
        # -- content-type provided as argument --
        "application/pdf",
        # -- auto-detected content-type --
        None,
    ],
)
def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partitioner(
    request: FixtureRequest, content_type: str | None
):
    """A `filetype` set by the delegate partitioner is replaced with the PDF MIME-type."""
    # -- the mocked partitioner stamps its elements with a bogus filetype --
    bogus_metadata = ElementMetadata(filetype="imapdf")
    stub_elements = [
        Text("text 1", metadata=bogus_metadata),
        Text("text 2", metadata=bogus_metadata),
    ]
    partition_pdf_ = function_mock(
        request, "unstructured.partition.pdf.partition_pdf", return_value=stub_elements
    )
    partitioner_loader_get_ = method_mock(
        request, _PartitionerLoader, "get", return_value=partition_pdf_
    )
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    elements = partition(pdf_path, content_type=content_type)

    partitioner_loader_get_.assert_called_once_with(ANY, FileType.PDF)
    assert len(elements) == 2
    for e in elements:
        assert e.metadata.filetype == "application/pdf"


@pytest.mark.parametrize(
    ("file_name", "file_type"),
    [
        ("stanley-cups.csv", FileType.CSV),
        ("simple.doc", FileType.DOC),
        ("simple.docx", FileType.DOCX),
        ("fake-email.eml", FileType.EML),
        ("simple.epub", FileType.EPUB),
        ("fake-html.html", FileType.HTML),
        ("README.md", FileType.MD),
        ("fake-email.msg", FileType.MSG),
        ("simple.odt", FileType.ODT),
        ("pdf/DA-1p.pdf", FileType.PDF),
        ("fake-power-point.ppt", FileType.PPT),
        ("simple.pptx", FileType.PPTX),
        ("README.rst", FileType.RST),
        ("fake-doc.rtf", FileType.RTF),
        ("stanley-cups.tsv", FileType.TSV),
        ("fake-text.txt", FileType.TXT),
        # NOTE(review): the `.xls` example is paired with `FileType.XLSX`, not an XLS member;
        # confirm this pairing is intentional.
        ("tests-example.xls", FileType.XLSX),
        ("stanley-cups.xlsx", FileType.XLSX),
        ("factbook.xml", FileType.XML),
    ],
)
def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(
    file_name: str, file_type: FileType
):
    """Each file-specific partitioner stamps its elements with its file-type's MIME-type.

    The partitioner is resolved from `file_type` and called directly rather than via
    `partition()`. Elements whose `filetype` is `None` are not checked.
    """
    # -- resolve the partitioner function for this file-type by module and function name --
    partitioner_module = import_module(file_type.partitioner_module_qname)
    partition_fn = getattr(partitioner_module, file_type.partitioner_function_name)

    # -- partition the example-doc for this filetype --
    elements = partition_fn(example_doc_path(file_name), process_attachments=False)

    assert elements
    for e in elements:
        filetype = e.metadata.filetype
        if filetype is None:
            continue
        assert filetype == file_type.mime_type


def test_detect_filetype_maps_file_to_bytes_io_when_spooled_temp_file_used(mocker):
    """`detect_filetype()` can read the head of a `SpooledTemporaryFile` it is given.

    `_FileTypeDetector` is replaced by a mock, so the detection context passed to its
    `file_type()` call can be inspected afterward.
    """
    file_type_detector_ = MagicMock(return_value=FileType.JSON)
    mocker.patch("unstructured.file_utils.filetype._FileTypeDetector", file_type_detector_)

    with tempfile.SpooledTemporaryFile() as spooled_file:
        spooled_file.write(b'{"text": Hello, world!}')
        # -- rewind so the content just written is readable from the start --
        spooled_file.seek(0)
        detect_filetype(file=spooled_file)

    # -- first positional argument of the most recent `.file_type()` call --
    positional_args = file_type_detector_.file_type.call_args[0]
    detection_context = positional_args[0]
    assert detection_context.text_head == '{"text": Hello, world!}'


# -- .languages -----------------------------------------------------------


def test_auto_partition_passes_user_provided_languages_arg_to_PDF():
    """A caller-supplied `languages` value appears in the metadata of every PDF element."""
    pdf_path = example_doc_path("pdf/chevron-page.pdf")

    elements = partition(pdf_path, strategy=PartitionStrategy.OCR_ONLY, languages=["eng"])

    for e in elements:
        assert e.metadata.languages == ["eng"]


def test_auto_partition_languages_argument_default_to_None_when_omitted():
    """When `languages` is omitted, only elements without text have no languages assigned."""
    docx_path = example_doc_path("handbook-1p.docx")

    elements = partition(docx_path, detect_language_per_element=True)

    # -- PageBreak and any other element with no text is assigned `None` --
    without_languages = [e for e in elements if e.metadata.languages is None]
    for e in without_languages:
        assert e.text == ""


def test_auto_partition_default_does_not_overwrite_other_defaults():
    """`partition()` and `partition_text()` detect the same languages when none are given.

    Neither call reports `["eng"]` for the non-English example document.
    """
    # -- imported locally; the default for `languages` is ["auto"] in partition_text --
    from unstructured.partition.text import partition_text

    # -- a document that is primarily in a language other than English --
    udhr_path = example_doc_path("language-docs/UDHR_first_article_all.txt")

    direct_languages = partition_text(udhr_path)[0].metadata.languages
    assert direct_languages != ["eng"]

    auto_first = partition(udhr_path)[0]
    assert auto_first.metadata.languages != ["eng"]
    assert auto_first.metadata.languages == direct_languages


# ================================================================================================
# MISCELLANEOUS BEHAVIORS
# ================================================================================================


def test_auto_partition_from_filename_works_on_empty_file():
    """Partitioning an empty file by path produces an empty list rather than an error."""
    elements = partition(example_doc_path("empty.txt"))

    assert elements == []


def test_auto_partition_from_file_works_on_empty_file():
    """Partitioning an empty file-object produces an empty list rather than an error."""
    with open(example_doc_path("empty.txt"), "rb") as empty_file:
        elements = partition(file=empty_file)
        assert elements == []


def test_auto_partition_that_requires_extras_raises_when_dependencies_are_not_installed(
    request: FixtureRequest,
):
    """`partition()` raises `ImportError` when a partitioner's dependency check fails."""
    # -- drop any PDF entry already stored on the loader class before partitioning --
    _PartitionerLoader._partitioners.pop(FileType.PDF, None)
    # -- report every dependency as missing --
    dependency_exists_ = function_mock(
        request, "unstructured.partition.auto.dependency_exists", return_value=False
    )
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    expected_message = (
        r"partition_pdf\(\) is not available because one or more dependencies are not installed"
    )

    with pytest.raises(ImportError, match=expected_message):
        partition(pdf_path)

    dependency_exists_.assert_called_once_with("pdf2image")


# ================================================================================================
# MODULE-LEVEL FIXTURES
# ================================================================================================


@pytest.fixture()
def expected_docx_elements():
    """Fixed list of elements for tests that request this fixture to compare against."""
    favorites = [
        Title("These are a few of my favorite things:"),
        ListItem("Parrots"),
        ListItem("Hockey"),
    ]
    analysis = [
        Text("Analysis"),
        NarrativeText("This is my first thought. This is my second thought."),
        NarrativeText("This is my third thought."),
    ]
    closing = [
        Text("2023"),
        Address("DOYLESTOWN, PA 18901"),
    ]
    return [*favorites, *analysis, *closing]


def _test_partition_foo():
    """No-op stand-in partitioner registered for the custom "FOO" file-type in a test below."""
    pass


def test_auto_partition_works_with_custom_types(
    request: FixtureRequest,
):
    """A partitioner registered for a newly created file-type is returned by the loader."""
    foo_file_type = create_file_type(
        "FOO", canonical_mime_type="application/foo", extensions=[".foo"]
    )

    # -- `register_partitioner()` returns a decorator; apply it to the stand-in function --
    register = register_partitioner(foo_file_type)
    register(_test_partition_foo)

    assert _PartitionerLoader().get(foo_file_type) is _test_partition_foo
-												fix(auto): partition() passes strategy to PPTX,DOCX (#3273)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_pptx()` or `partition_docx()`.
											
										
										
											2024-06-21 17:16:39 -07:00
+								# pyright: reportPrivateUsage=false
-												rfctr: flatten test_unstructured/partition (#3073)

**Summary**
Some partitioner test modules are placed in directories by themselves or
with one other test module. This unnecessarily obscures where to find
the test module corresponding to a partitioner.

Move partitioner test modules to mirror the directory structure of
`unstructured/partition`.
											
										
										
											2024-05-22 17:51:08 -07:00
+								from __future__ import annotations
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								import json
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								import os
 								import pathlib
-												Refactor: rename image extraction kwargs (#2303)

Currently, we're using different kwarg names in partition() and
partition_pdf(), which has implications for the API since it goes
through partition().

### Summary
- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` to `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in
`partition.auto`
- add unit tests to test element extraction for `pdf/image` via
`partition.auto`
### Testing
CI should pass.
											
										
										
											2024-01-04 09:52:00 -08:00
+								import tempfile
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
+								import warnings
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								from importlib import import_module
-												rfctr(part): prepare for pluggable auto-partitioners 1 (#3655)

**Summary**
In preparation for pluggable auto-partitioners simplify metadata as
discussed.

**Additional Context**
- Pluggable auto-partitioners requires partitioners to have a consistent
call signature. An arbitrary partitioner provided at runtime needs to
have a call signature that is known and consistent. Basically
`partition_x(filename, *, file, **kwargs)`.
- The current `auto.partition()` is highly coupled to each distinct
file-type partitioner, deciding which arguments to forward to each.
- This is driven by the existence of "delegating" partitioners, those
that convert their file-type and then call a second partitioner to do
the actual partitioning. Both the delegating and proxy partitioners are
decorated with metadata-post-processing decorators and those decorators
are not idempotent. We call the situation where those decorators would
run twice "double-decorating". For example, EPUB converts to HTML and
calls `partition_html()` and both `partition_epub()` and
`partition_html()` are decorated.
- The way double-decorating has been avoided in the past is to avoid
sending the arguments the metadata decorators are sensitive to to the
proxy partitioner. This is very obscure, complex to reason about,
error-prone, and just overall not a viable strategy. The better solution
is to not decorate delegating partitioners and let the proxy partitioner
handle all the metadata.
- This first step in preparation for that is part of simplifying the
metadata processing by removing unused or unwanted legacy parameters.
- `date_from_file_object` is a misnomer because a file-object never
contains last-modified data.
- It can never produce useful results in the API where last-modified
information must be provided by `metadata_last_modified`.
- It is an undocumented parameter so not in use.
- Using it can produce incorrect metadata.
											
										
										
											2024-09-23 15:23:10 -07:00
+								from typing import Iterator
-												Fix file detection when spooled file is pased (#3932)

This pull request fixes the scenario where a SpooledTemporaryFile is passed
to `detect_filetype()`. In such cases an arbitrary number was assigned as
'name' (and it couldn't be overwritten because attributes can't be assigned
on a SpooledTemporaryFile 😩), so I added another case to our object factory
that handles this type of file.
For BytesIO the `name` attr is None, as it should be, and other metadata
fields are leveraged for file type recognition.
											
										
										
											2025-02-20 14:00:25 +01:00
+								from unittest.mock import MagicMock, patch
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								import pytest
-												enhancement: add support from bitmap images (#2414)

### Summary

Adds support for bitmap images (`.bmp`) in both file detection and
partitioning. Bitmap images will be processed with `partition_image`
just like JPGs and PNGs.

### Testing

```python
from unstructured.file_utils.filetype import detect_filetype
from unstructured.partition.auto import partition
from PIL import Image

filename = "example-docs/layout-parser-paper-with-table.jpg"
bmp_filename = "~/tmp/ayout-parser-paper-with-table.bmp"

img = Image.open(filename)
img.save(bmp_filename)

detect_filetype(filename=bmp_filename) # Should be FileType.BMP

elements = partition(filename=bmp_filename)
```
											
										
										
											2024-01-17 17:50:36 -05:00
+								from PIL import Image
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												Refactor: rename image extraction kwargs (#2303)

Currently, we're using different kwarg names in partition() and
partition_pdf(), which has implications for the API since it goes
through partition().

### Summary
- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` to `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in
`partition.auto`
- add unit tests to test element extraction for `pdf/image` via
`partition.auto`
### Testing
CI should pass.
											
										
										
											2024-01-04 09:52:00 -08:00
+								from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
-												fix: stop csv and tsv dropping the first line of the file (#1530)

The current code assumes the first line of csv and tsv files are a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.

Here is a snippet of code that demonstrates the current behavior and the
proposed fix

```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring

c1 = """
    Stanley Cups,,
    Team,Location,Stanley Cups
    Blues,STL,1
    Flyers,PHI,2
    Maple Leafs,TOR,13
    """

f = "./test.csv"
with open(f, 'w') as ff:
    ff.write(c1)
  
print("Suggested Improvement Keep First Line") 
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)

print("\n\nOriginal Looses First Line") 
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-17 00:59:35 +02:00
+								from test_unstructured.partition.test_constants import (
 								    EXPECTED_TABLE,
 								    EXPECTED_TABLE_XLSX,
 								    EXPECTED_TEXT,
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    EXPECTED_XLS_TABLE,
-												fix: stop csv and tsv dropping the first line of the file (#1530)

The current code assumes the first line of csv and tsv files are a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.

Here is a snippet of code that demonstrates the current behavior and the
proposed fix

```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring

c1 = """
    Stanley Cups,,
    Team,Location,Stanley Cups
    Blues,STL,1
    Flyers,PHI,2
    Maple Leafs,TOR,13
    """

f = "./test.csv"
with open(f, 'w') as ff:
    ff.write(c1)
  
print("Suggested Improvement Keep First Line") 
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)

print("\n\nOriginal Looses First Line") 
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
											
										
										
											2023-10-17 00:59:35 +02:00
+								)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								from test_unstructured.unit_utils import (
 								    ANY,
 								    FixtureRequest,
 								    LogCaptureFixture,
 								    example_doc_path,
 								    function_mock,
 								    method_mock,
 								)
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
+								from unstructured.cleaners.core import clean_extra_whitespace
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								from unstructured.documents.elements import (
 								    Address,
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    CompositeElement,
-												fix(auto): partition() passes strategy to PPTX,DOCX (#3273)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_pptx()` or `partition_docx()`.
											
										
										
											2024-06-21 17:16:39 -07:00
+								    Element,
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								    ElementMetadata,
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    ListItem,
 								    NarrativeText,
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
+								    Table,
-												chore: Table chunking (#1540)

This change extends our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy, we have renamed `combine_under_n_chars` to
`max_characters`. It functions the same way for combining elements
under Titles, and also specifies the chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
In [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
+								    TableChunk,
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    Text,
 								    Title,
 								)
-												Fix file detection when spooled file is pased (#3932)

This pull request fixes the scenario where a `SpooledTemporaryFile` is passed
to `detect_filetype`. In such cases some weird number was assigned as
`name` (and it couldn't be overwritten, because `SpooledTemporaryFile` can't
have fields assigned 😩), so I added one more scenario to our object factory
in which we parse this type of file.
For `BytesIO`, the `name` attr is `None`, as it should be, and some other metadata
fields are leveraged for file type recognition.
											
										
										
											2025-02-20 14:00:25 +01:00
+								from unstructured.file_utils.filetype import detect_filetype
-												Enable dynamic file type registration (#3946)

The purpose of this PR is to enable registering new file types
dynamically.

The PR enables this through 2 primary functions:

1. `unstructured.file_utils.model.create_file_type` This registers the
new `FileType` enum which enables the rest of unstructured to understand
a new type of file
2. `unstructured.file_utils.model.register_partitioner` Decorator that
enables registering a partitioner function to run for a file type.

---------

Co-authored-by: Roman Isecke <136338424+rbiseck3@users.noreply.github.com>
											
										
										
											2025-03-06 17:09:42 -05:00
+								from unstructured.file_utils.model import FileType, create_file_type, register_partitioner
-												rfctr(auto): add _PartitionerLoader (#3418)

**Summary**
Replace conditional explicit import of partitioner modules in
`.partition.auto` with the new `_PartitionerLoader` class. This avoids
unbound variable warnings and is much less noisy.

`_PartitionerLoader` makes use of the new `FileType` property
`.importable_package_dependencies` to determine whether all required
packages are importable before dispatching the file to its partitioner.
It uses `FileType.extra_name` to form a helpful error message when a
dependency is not installed, so the caller knows which `pip install`
extra to specify to remedy the error.

`PartitionerLoader` uses the `FileType` properties
`.partitioner_module_qname` and `partitioner_function_name` to load
the partitioner once its dependencies are verified. Loaded partitioners
are cached with module lifetime scope for efficiency.
											
										
										
											2024-07-21 23:03:55 -07:00
+								from unstructured.partition.auto import _PartitionerLoader, partition
-												rfctr(email): eml partitioner rewrite (#3694)

**Summary**
Initial attempts to incrementally refactor `partition_email()` into
shape to allow pluggable partitioning quickly became too complex for
ready code-review. Prepare separate rewritten module and tests and swap
them out whole.

**Additional Context**
- Uses the modern stdlib `email` module to reliably accomplish several
manual decoding steps in the legacy code.
- Remove obsolete email-specific element-types which were replaced 18
months or so ago with email-specific metadata fields for things like Cc:
addresses, subject, etc.
- Remove accepting an email as `text: str` because MIME-email is
inherently a binary format which can and often does contain multiple and
contradictory character-encodings.
- Remove `encoding` parameters as it is now unused. An email file is not
a text file and as such does not have a single overall encoding.
Character encoding is specified individually for each MIME-part within
the message and often varies from one part to another in the same
message.
- Remove the need for a caller to specify `attachment_partitioner`.
There is only one reasonable choice for this which is
`auto.partition()`, consistent with the same interface and operation in
`partition_msg()`.
- Fixes #3671 along the way by silently skipping attachments with a
file-type for which there is no partitioner.
- Substantially extend the test-suite to cover multiple
transport-encoding/charset combinations.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: scanny <scanny@users.noreply.github.com>
											
										
										
											2024-10-15 19:02:33 -07:00
+								from unstructured.partition.common import UnsupportedFileFormatError
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								from unstructured.partition.utils.constants import PartitionStrategy
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								from unstructured.staging.base import elements_from_json, elements_to_dicts, elements_to_json
-												Bug/635 unicode decode error eml (#739)

* Adds functionality to extract charset info from eml files
* Adds missed file-like object handling in detect_file_encoding
* Adds functionality to replace the MIME encodings for eml files with one of the
   common encodings if a unicode error occurs
* Organize the eml example files in the example-docs/eml directory

											
										
										
											2023-06-16 17:52:13 -07:00
-												fix: correct order of kwargs in pandoc (#421)

* fix: correct order of kwargs in pandoc

* only skip epub tests in Docker

* changelog

---------

Co-authored-by: Crag Wolfe <crag@unstructuredai.io>
Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-30 16:54:29 -04:00
+								is_in_docker = os.path.exists("/.dockerenv")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# CSV
 								# ================================================================================================
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_csv_from_filename():
 								    elements = partition(example_doc_path("stanley-cups.csv"))
 								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
 								    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
 								    assert elements[0].metadata.filetype == "text/csv"
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_csv_from_file():
 								    with open(example_doc_path("stanley-cups.csv"), "rb") as f:
 								        elements = partition(file=f)
 								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
 								    assert isinstance(elements[0], Table)
 								    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
 								    assert elements[0].metadata.filetype == "text/csv"
 								# ================================================================================================
 								# DOC
 								# ================================================================================================
 								@pytest.mark.parametrize(
 								    ("pass_metadata_filename", "content_type"),
 								    [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
 								)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_doc_from_filename(
 								    pass_metadata_filename: bool, content_type: str | None, expected_docx_elements: list[Element]
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								):
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    file_path = example_doc_path("simple.doc")
 								    metadata_filename = file_path if pass_metadata_filename else None
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    elements = partition(
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								        filename=file_path,
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        metadata_filename=metadata_filename,
 								        content_type=content_type,
 								        strategy=PartitionStrategy.HI_RES,
 								    )
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
 								    for e in elements:
 								        print(f"{type(e).__name__}({repr(e.text)})")
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements == expected_docx_elements
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert all(e.metadata.filename == "simple.doc" for e in elements)
 								    assert all(e.metadata.file_directory == example_doc_path("") for e in elements)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								@pytest.mark.skipif(is_in_docker, reason="Passes in CI but not Docker. Remove skip on #3364 fix.")
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_doc_from_file(expected_docx_elements: list[Element]):
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    with open(example_doc_path("simple.doc"), "rb") as f:
 								        elements = partition(file=f)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert elements == expected_docx_elements
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								# ================================================================================================
 								# DOCX
 								# ================================================================================================
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_docx_from_filename(expected_docx_elements: list[Element]):
 								    elements = partition(example_doc_path("simple.docx"), strategy=PartitionStrategy.HI_RES)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
 								    assert elements == expected_docx_elements
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert all(e.metadata.filename == "simple.docx" for e in elements)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_docx_from_file(expected_docx_elements: list[Element]):
 								    with open(example_doc_path("simple.docx"), "rb") as f:
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert elements == expected_docx_elements
-												fix(auto): partition() passes strategy to DOC,ODT (#3278)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_doc()` or `partition_odt()` and so was not
making its way to `partition_docx()`.
											
										
										
											2024-06-25 17:29:47 -07:00
+								@pytest.mark.parametrize("file_name", ["simple.docx", "simple.doc", "simple.odt"])
-												fix(auto): partition() passes strategy to PPTX,DOCX (#3273)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_pptx()` or `partition_docx()`.
											
										
										
											2024-06-21 17:16:39 -07:00
+								@pytest.mark.parametrize(
 								    "strategy",
 								    [
 								        PartitionStrategy.AUTO,
 								        PartitionStrategy.FAST,
 								        PartitionStrategy.HI_RES,
 								        PartitionStrategy.OCR_ONLY,
 								    ],
 								)
-												fix(auto): partition() passes strategy to DOC,ODT (#3278)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_doc()` or `partition_odt()` and so was not
making its way to `partition_docx()`.
											
										
										
											2024-06-25 17:29:47 -07:00
+								def test_partition_forwards_strategy_arg_to_partition_docx_and_its_brokers(
 								    request: FixtureRequest, file_name: str, strategy: str
 								):
 								    """The `strategy` arg value received by `partition()` is received by `partition_docx()`.
 								    To do this in the brokering-partitioner cases (DOC, ODT) it must make its way to
 								    `partition_doc()` or `partition_odt()` which must then forward it to `partition_docx()`. This
 								    test makes sure it made it all the way.
 								    Note this is 3 file-types X 4 strategies = 12 test-cases.
 								    """
-												fix(auto): partition() passes strategy to PPTX,DOCX (#3273)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_pptx()` or `partition_docx()`.
											
										
										
											2024-06-21 17:16:39 -07:00
+								    from unstructured.partition.docx import _DocxPartitioner
 								    def fake_iter_document_elements(self: _DocxPartitioner) -> Iterator[Element]:
 								        yield Text(f"strategy=={self._opts.strategy}")
 								    _iter_elements_ = method_mock(
 								        request,
 								        _DocxPartitioner,
 								        "_iter_document_elements",
 								        side_effect=fake_iter_document_elements,
 								    )
-												fix(auto): partition() passes strategy to DOC,ODT (#3278)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_doc()` or `partition_odt()` and so was not
making its way to `partition_docx()`.
											
										
										
											2024-06-25 17:29:47 -07:00
+								    (element,) = partition(example_doc_path(file_name), strategy=strategy)
-												fix(auto): partition() passes strategy to PPTX,DOCX (#3273)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_pptx()` or `partition_docx()`.
											
										
										
											2024-06-21 17:16:39 -07:00
 								    _iter_elements_.assert_called_once_with(ANY)
 								    assert element.text == f"strategy=={strategy}"
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# EML
 								# ================================================================================================
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								EXPECTED_EMAIL_OUTPUT = [
 								    NarrativeText(text="This is a test email to use for unit tests."),
-												fix: html incorrectly categorizing text (#3841)

Fixes #3666

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: scanny <scanny@users.noreply.github.com>
											
										
										
											2024-12-18 10:46:54 -08:00
+								    Text(text="Important points:"),
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    ListItem(text="Roses are red"),
 								    ListItem(text="Violets are blue"),
 								]
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								def test_auto_partition_email_from_filename():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("eml/fake-email.eml")
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(file_path, strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_EMAIL_OUTPUT
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    assert elements[0].metadata.filename == os.path.basename(file_path)
 								    assert elements[0].metadata.file_directory == os.path.split(file_path)[0]
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								def test_auto_partition_email_from_file():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    with open(example_doc_path("eml/fake-email.eml"), "rb") as f:
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_EMAIL_OUTPUT
 								# ================================================================================================
 								# EPUB
 								# ================================================================================================
 								def test_auto_partition_epub_from_filename():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(example_doc_path("winter-sports.epub"), strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert len(elements) > 0
-												feat: include images when partitioning html (#3945)

Currently we [filter img
tags](https://github.com/Unstructured-IO/unstructured/blob/2addb19473ba9e27af995291f57d35fb50bec4b0/unstructured/partition/html/partition.py#L226-L229)
before tags are converted to Elements by the html partitioner. More
importantly we also don’t currently have a defined “block” / mapping to
support these. This adds these mappings and logic to process.

It also respects `extract_image_block_types` and
`extract_image_block_to_payload` (as we do with pdfs) to determine
whether base64 is included in the metadata.

The partitioned Image Elements set the text to the img tag’s alt text
if available.

The partitioned Image Elements include the [url in the
metadata](https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/documents/elements.py#L209)
(rather than image_base64) if the img tag src is a url.

## Testing

unit tests have been added for explicit coverage.
existing integration tests and other unit test fixtures have been
updated to account for `Image` elements now present

---------

Co-authored-by: ryannikolaidis <ryannikolaidis@users.noreply.github.com>
											
										
										
											2025-03-07 17:25:21 -08:00
+								    assert elements[2].text.startswith("The Project Gutenberg eBook of Winter Sports")
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								def test_auto_partition_epub_from_file():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    with open(example_doc_path("winter-sports.epub"), "rb") as f:
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert len(elements) > 0
-												feat: include images when partitioning html (#3945)

Currently we [filter img
tags](https://github.com/Unstructured-IO/unstructured/blob/2addb19473ba9e27af995291f57d35fb50bec4b0/unstructured/partition/html/partition.py#L226-L229)
before tags are converted to Elements by the html partitioner. More
importantly we also don’t currently have a defined “block” / mapping to
support these. This adds these mappings and logic to process.

It also respects `extract_image_block_types` and
`extract_image_block_to_payload` (as we do with pdfs) to determine
whether base64 is included in the metadata.

The partitioned Image Elements set the text to the img tag’s alt text
if available.

The partitioned Image Elements include the [url in the
metadata](https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/documents/elements.py#L209)
(rather than image_base64) if the img tag src is a url.

## Testing

unit tests have been added for explicit coverage.
existing integration tests and other unit test fixtures have been
updated to account for `Image` elements now present

---------

Co-authored-by: ryannikolaidis <ryannikolaidis@users.noreply.github.com>
											
										
										
											2025-03-07 17:25:21 -08:00
+								    assert elements[2].text.startswith("The Project Gutenberg eBook of Winter Sports")
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								# ================================================================================================
 								# HTML
 								# ================================================================================================
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
 								)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_html_from_filename(pass_metadata_filename: bool, content_type: str | None):
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    file_path = example_doc_path("example-10k-1p.html")
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    metadata_filename = file_path if pass_metadata_filename else None
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        filename=file_path,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								        metadata_filename=metadata_filename,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        content_type=content_type,
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        strategy=PartitionStrategy.HI_RES,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    )
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
 								    assert elements
 								    expected_filename, expected_directory = os.path.basename(file_path), os.path.split(file_path)[0]
 								    assert all(e.metadata.filename == expected_filename for e in elements)
 								    assert all(e.metadata.file_directory == expected_directory for e in elements)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
 								)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_html_from_file(pass_metadata_filename: bool, content_type: str | None):
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    file_path = example_doc_path("example-10k-1p.html")
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    metadata_filename = file_path if pass_metadata_filename else None
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    with open(file_path, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(
 								            file=f,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								            metadata_filename=metadata_filename,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								            content_type=content_type,
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								            strategy=PartitionStrategy.HI_RES,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        )
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
 								    assert len(elements) > 0
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_html_pre_from_file():
 								    elements = partition(example_doc_path("fake-html-pre.htm"))
 								    assert len(elements) > 0
 								    assert "PageBreak" not in [elem.category for elem in elements]
 								    assert clean_extra_whitespace(elements[0].text).startswith("[107th Congress Public Law 56]")
 								    assert isinstance(elements[0], NarrativeText)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert all(e.metadata.filetype == "text/html" for e in elements)
 								    assert all(e.metadata.filename == "fake-html-pre.htm" for e in elements)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								# ================================================================================================
 								# IMAGE
 								# ================================================================================================
 								@pytest.mark.parametrize(
 								    ("pass_metadata_filename", "content_type"),
 								    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
 								)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_jpeg_from_filename(pass_metadata_filename: bool, content_type: str | None):
-												refactor: restructure PDF/Image example document organization (#3410)

This PR aims to improve the organization and readability of our example
documents used in unit tests, specifically focusing on PDF and image
files.

### Summary
- Created two new subdirectories in the `example-docs` folder:
  - `pdf/`: for all PDF example files
  - `img/`: for all image example files
- Moved relevant PDF files from `example-docs/` to `example-docs/pdf/`
- Moved relevant image files from `example-docs/` to `example-docs/img/`
- Updated file paths in affected unit & ingest tests to reflect the new
directory structure

### Testing
All unit & ingest tests should be updated and verified to work with the
new file structure.

## Notes
Other file types (e.g., office documents, HTML files) remain in the root
of `example-docs/` for now.

## Next Steps
Consider similar reorganization for other file types if this structure
proves to be beneficial.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-07-18 15:21:32 -07:00
+								    file_path = example_doc_path("img/layout-parser-paper-fast.jpg")
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    metadata_filename = file_path if pass_metadata_filename else None
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    elements = partition(
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        filename=file_path,
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        metadata_filename=metadata_filename,
 								        content_type=content_type,
 								        strategy=PartitionStrategy.AUTO,
 								    )
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    e = elements[2]
 								    assert e.text == (
 								        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    )
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert e.metadata.coordinates is not None
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								@pytest.mark.parametrize(
 								    ("pass_metadata_filename", "content_type"),
 								    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
 								)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_jpeg_from_file(pass_metadata_filename: bool, content_type: str | None):
-												refactor: restructure PDF/Image example document organization (#3410)

This PR aims to improve the organization and readability of our example
documents used in unit tests, specifically focusing on PDF and image
files.

### Summary
- Created two new subdirectories in the `example-docs` folder:
  - `pdf/`: for all PDF example files
  - `img/`: for all image example files
- Moved relevant PDF files from `example-docs/` to `example-docs/pdf/`
- Moved relevant image files from `example-docs/` to `example-docs/img/`
- Updated file paths in affected unit & ingest tests to reflect the new
directory structure

### Testing
All unit & ingest tests should be updated and verified to work with the
new file structure.

## Notes
Other file types (e.g., office documents, HTML files) remain in the root
of `example-docs/` for now.

## Next Steps
Consider similar reorganization for other file types if this structure
proves to be beneficial.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-07-18 15:21:32 -07:00
+								    file_path = example_doc_path("img/layout-parser-paper-fast.jpg")
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    metadata_filename = file_path if pass_metadata_filename else None
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    with open(file_path, "rb") as f:
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        elements = partition(
 								            file=f,
 								            metadata_filename=metadata_filename,
 								            content_type=content_type,
 								            strategy=PartitionStrategy.AUTO,
 								        )
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    e = elements[2]
 								    assert e.text == (
 								        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
 								    )
 								    assert e.metadata.coordinates is not None
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
 								def test_auto_partition_bmp_from_filename(tmp_path: pathlib.Path):
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    bmp_filename = str(tmp_path / "example.bmp")
-												refactor: restructure PDF/Image example document organization (#3410)

This PR aims to improve the organization and readability of our example
documents used in unit tests, specifically focusing on PDF and image
files.

### Summary
- Created two new subdirectories in the `example-docs` folder:
  - `pdf/`: for all PDF example files
  - `img/`: for all image example files
- Moved relevant PDF files from `example-docs/` to `example-docs/pdf/`
- Moved relevant image files from `example-docs/` to `example-docs/img/`
- Updated file paths in affected unit & ingest tests to reflect the new
directory structure

### Testing
All unit & ingest tests should be updated and verified to work with the
new file structure.

## Notes
Other file types (e.g., office documents, HTML files) remain in the root
of `example-docs/` for now.

## Next Steps
Consider similar reorganization for other file types if this structure
proves to be beneficial.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-07-18 15:21:32 -07:00
+								    with Image.open(example_doc_path("img/layout-parser-paper-with-table.jpg")) as img:
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        img.save(bmp_filename)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    elements = partition(filename=bmp_filename, strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								    table = [e.metadata.text_as_html for e in elements if e.metadata.text_as_html]
 								    assert len(table) == 1
 								    assert "<table><thead><tr>" in table[0]
 								    assert "</thead><tbody><tr>" in table[0]
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
 								def test_auto_partition_image_element_extraction(extract_image_block_to_payload: bool):
 								    extract_image_block_types = ["Image", "Table"]
 								    with tempfile.TemporaryDirectory() as tmpdir:
 								        elements = partition(
-												refactor: restructure PDF/Image example document organization (#3410)

This PR aims to improve the organization and readability of our example
documents used in unit tests, specifically focusing on PDF and image
files.

### Summary
- Created two new subdirectories in the `example-docs` folder:
  - `pdf/`: for all PDF example files
  - `img/`: for all image example files
- Moved relevant PDF files from `example-docs/` to `example-docs/pdf/`
- Moved relevant image files from `example-docs/` to `example-docs/img/`
- Updated file paths in affected unit & ingest tests to reflect the new
directory structure

### Testing
All unit & ingest tests should be updated and verified to work with the
new file structure.

## Notes
Other file types (e.g., office documents, HTML files) remain in the root
of `example-docs/` for now.

## Next Steps
Consider similar reorganization for other file types if this structure
proves to be beneficial.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-07-18 15:21:32 -07:00
+								            filename=example_doc_path("img/embedded-images-tables.jpg"),
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								            extract_image_block_types=extract_image_block_types,
 								            extract_image_block_to_payload=extract_image_block_to_payload,
 								            extract_image_block_output_dir=tmpdir,
 								        )
 								        assert_element_extraction(
 								            elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
 								        )
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# JSON
 								# ================================================================================================
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								# TODO(scanny): This test should go away when we fix #3365. This test glosses over several
 								# important JSON "rehydration" behaviors, in particular that the metadata should match exactly.
 								# The following test `test_auto_partition_json_from_file_preserves_original_elements` will be the
 								# replacement for this test.
-												Better element IDs - deterministic and document-unique hashes (#2673)

Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes element's sequence number on page, page
number, document filename and its text
* there are more tests for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

											
										
										
											2024-04-24 09:05:20 +02:00
+								def test_auto_partitioned_json_output_maintains_consistency_with_fixture_elements():
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json" on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								    """Test auto-processing an unstructured json output file by filename."""
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    json_file_path = example_doc_path("spring-weather.html.json")
-												Better element IDs - deterministic and document-unique hashes (#2673)

Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes element's sequence number on page, page
number, document filename and its text
* there are more tests for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

											
										
										
											2024-04-24 09:05:20 +02:00
+								    original_file_name = "spring-weather.html"
 								    with open(json_file_path) as json_f:
 								        expected_result = json.load(json_f)
 								    partitioning_result = json.loads(
-												rfctr(part): prepare for pluggable auto-partitioners 1 (#3655)

**Summary**
In preparation for pluggable auto-partitioners simplify metadata as
discussed.

**Additional Context**
- Pluggable auto-partitioners requires partitioners to have a consistent
call signature. An arbitrary partitioner provided at runtime needs to
have a call signature that is known and consistent. Basically
`partition_x(filename, *, file, **kwargs)`.
- The current `auto.partition()` is highly coupled to each distinct
file-type partitioner, deciding which arguments to forward to each.
- This is driven by the existence of "delegating" partitioners, those
that convert their file-type and then call a second partitioner to do
the actual partitioning. Both the delegating and proxy partitioners are
decorated with metadata-post-processing decorators and those decorators
are not idempotent. We call the situation where those decorators would
run twice "double-decorating". For example, EPUB converts to HTML and
calls `partition_html()` and both `partition_epub()` and
`partition_html()` are decorated.
- The way double-decorating has been avoided in the past is to avoid
sending the arguments the metadata decorators are sensitive to to the
proxy partitioner. This is very obscure, complex to reason about,
error-prone, and just overall not a viable strategy. The better solution
is to not decorate delegating partitioners and let the proxy partitioner
handle all the metadata.
- This first step in preparation for that is part of simplifying the
metadata processing by removing unused or unwanted legacy parameters.
- `date_from_file_object` is a misnomer because a file-object never
contains last-modified data.
- It can never produce useful results in the API where last-modified
information must be provided by `metadata_last_modified`.
- It is an undocumented parameter so not in use.
- Using it can produce incorrect metadata.
											
										
										
											2024-09-23 15:23:10 -07:00
+								        elements_to_json(
 								            partition(
 								                filename=str(json_file_path),
 								                # -- use the original file name to get the same element IDs (hashes) --
 								                metadata_filename=original_file_name,
 								                strategy=PartitionStrategy.HI_RES,
 								            )
-												Better element IDs - deterministic and document-unique hashes (#2673)

Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes element's sequence number on page, page
number, document filename and its text
* there are more tests for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

											
										
										
											2024-04-24 09:05:20 +02:00
+								        )
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								    )
-												Better element IDs - deterministic and document-unique hashes (#2673)

Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes element's sequence number on page, page
number, document filename and its text
* there are more tests for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

											
										
										
											2024-04-24 09:05:20 +02:00
+								    for elem in partitioning_result:
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								        elem.pop("metadata")
-												Better element IDs - deterministic and document-unique hashes (#2673)

Part two of: https://github.com/Unstructured-IO/unstructured/pull/2842

Main changes compared to part one:
* hash computation includes element's sequence number on page, page
number, document filename and its text
* there are more tests for deterministic behavior of IDs returned by
partitioning functions + their uniqueness (guaranteed at the document
level, and high probability across multiple documents)

This PR addresses the following issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

											
										
										
											2024-04-24 09:05:20 +02:00
+								    for elem in expected_result:
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								        elem.pop("metadata")
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json" on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert expected_result == partitioning_result
-												refactor: simplifies JSON detection and add tests (#975)

* refactor json detection

* version and changelog

* fix mock in test
											
										
										
											2023-07-25 15:59:45 -04:00
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json" on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								@pytest.mark.xfail(
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    reason=(
 								        "https://github.com/Unstructured-IO/unstructured/issues/3365"
 								        " partition_json() does not preserve original element-id or metadata"
 								    ),
 								    raises=AssertionError,
 								    strict=True,
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json" on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								)
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								def test_auto_partition_json_from_file_preserves_original_elements():
 								    file_path = example_doc_path("simple.json")
 								    original_elements = elements_from_json(file_path)
 								    with open(file_path, "rb") as f:
 								        partitioned_elements = partition(file=f)
 								    assert elements_to_dicts(partitioned_elements) == elements_to_dicts(original_elements)
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json" on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Includes an xfail test for #492.
											
										
										
											2023-04-17 23:11:21 -07:00
-												Fix json bytes content type detection (#3941)

Fixes order of content type detection strategies for byte-encoded jsons.

Example
```
json_bytes = json.dumps([{"example": "data"}]).encode("utf-8")
file_buffer = io.BytesIO(json_bytes)
detect_filetype(file=file_buffer, metadata_file_path="filename.pdf") 
```

Before
PDF

Now
JSON
											
										
										
											2025-03-07 11:33:33 +01:00
+								def test_auto_partition_processes_simple_ndjson(tmp_path: pathlib.Path):
 								    text = '{"text": "hello", "type": "NarrativeText"}'
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    file_path = str(tmp_path / "unprocessable.json")
 								    with open(file_path, "w") as f:
 								        f.write(text)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
-												Fix json bytes content type detection (#3941)

Fixes order of content type detection strategies for byte-encoded jsons.

Example
```
json_bytes = json.dumps([{"example": "data"}]).encode("utf-8")
file_buffer = io.BytesIO(json_bytes)
detect_filetype(file=file_buffer, metadata_file_path="filename.pdf") 
```

Before
PDF

Now
JSON
											
										
										
											2025-03-07 11:33:33 +01:00
+								    result = partition(filename=file_path)
 								    assert len(result) == 1
 								    assert isinstance(result[0], NarrativeText)
 								    assert "hello" in result[0].text
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								# ================================================================================================
 								# MD
 								# ================================================================================================
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_partition_md_from_url_works_with_embedded_html():
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"
 								    elements = partition(url=url, content_type="text/markdown", strategy=PartitionStrategy.HI_RES)
-												feat: include images when partitioning html (#3945)

Currently we [filter img
tags](https://github.com/Unstructured-IO/unstructured/blob/2addb19473ba9e27af995291f57d35fb50bec4b0/unstructured/partition/html/partition.py#L226-L229)
before tags are converted to Elements by the html partitioner. More
importantly we also don’t currently have a defined “block” / mapping to
support these. This adds these mappings and logic to process.

It also respects `extract_image_block_types` and
`extract_image_block_to_payload` (as we do with pdfs) to determine
whether base64 is included in the metadata.

The partitioned Image Elements set their text to the img tag’s alt text
if available.

The partitioned Image Elements include the [url in the
metadata](https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/documents/elements.py#L209)
(rather than image_base64) if the img tag src is a url.

## Testing

Unit tests have been added for explicit coverage.
Existing integration tests and other unit test fixtures have been
updated to account for `Image` elements now present.

---------

Co-authored-by: ryannikolaidis <ryannikolaidis@users.noreply.github.com>
											
										
										
											2025-03-07 17:25:21 -08:00
+								    assert "unstructured" in elements[1].text
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								# ================================================================================================
 								# MSG
 								# ================================================================================================
 								def test_auto_partition_msg_from_filename():
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert partition(example_doc_path("fake-email.msg"), strategy=PartitionStrategy.HI_RES) == [
 								        NarrativeText(text="This is a test email to use for unit tests."),
-												fix: html incorrectly categorizing text (#3841)

Fixes #3666

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: scanny <scanny@users.noreply.github.com>
											
										
										
											2024-12-18 10:46:54 -08:00
+								        Text(text="Important points:"),
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								        ListItem(text="Roses are red"),
 								        ListItem(text="Violets are blue"),
 								    ]
-												feat: add support for `.txt` files in `partition` (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
											
										
										
											2023-01-13 16:39:53 -05:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# ODT
 								# ================================================================================================
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_odt_from_filename(expected_docx_elements: list[Element]):
 								    elements = partition(example_doc_path("simple.odt"), strategy=PartitionStrategy.HI_RES)
 								    assert elements == expected_docx_elements
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_odt_from_file(expected_docx_elements: list[Element]):
 								    with open(example_doc_path("simple.odt"), "rb") as f:
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert elements == expected_docx_elements
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								# ================================================================================================
 								# ORG
 								# ================================================================================================
 								def test_auto_partition_org_from_filename():
 								    """Partitioning `README.org` by path yields a leading "Example Docs" Title typed text/org."""
 								    elements = partition(example_doc_path("README.org"))
 								    assert elements[0] == Title("Example Docs")
 								    assert elements[0].metadata.filetype == "text/org"
 								def test_auto_partition_org_from_file():
 								    """Partitioning `README.org` from an open binary file with an explicit "text/org"
 								    content type yields a leading "Example Docs" Title whose filetype is text/org."""
 								    with open(example_doc_path("README.org"), "rb") as f:
 								        elements = partition(file=f, content_type="text/org")
 								    assert elements[0] == Title("Example Docs")
 								    assert elements[0].metadata.filetype == "text/org"
 								# ================================================================================================
 								# PDF
 								# ================================================================================================
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
 								)
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_type: str | None):
-												feat: improve pdfminer element processing (#3618)

This PR implements splitting of `pdfminer` elements (`groups of text
chunks`) into smaller bounding boxes (`text lines`). This implementation
prevents loss of information from the object detection model and
facilitates more effective removal of duplicated `pdfminer` text. This
PR also addresses #3430.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-09-12 14:17:27 -07:00
+								    file_path = example_doc_path("pdf/chevron-page.pdf")
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    metadata_filename = file_path if pass_metadata_filename else None
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        filename=file_path,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								        metadata_filename=metadata_filename,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        content_type=content_type,
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        strategy=PartitionStrategy.HI_RES,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    )
-												chore: return `Element` objects in `partition_pdf` and `partition_image` (#164)

* helper function to convert to element

* test for element types

* fix for healthcheck url

* version bump

* note on coordinates

* mention FigureCaption

* test_shared -> test_common

* add check boxes for checkbox template

* update changelog
											
										
										
											2023-01-19 09:29:28 -05:00
-												feat: improve pdfminer element processing (#3618)

This PR implements splitting of `pdfminer` elements (`groups of text
chunks`) into smaller bounding boxes (`text lines`). This implementation
prevents loss of information from the object detection model and
facilitates more effective removal of duplicated `pdfminer` text. This
PR also addresses #3430.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-09-12 14:17:27 -07:00
+								    e = elements[0]
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    assert isinstance(e, Title)
-												feat: improve pdfminer element processing (#3618)

This PR implements splitting of `pdfminer` elements (`groups of text
chunks`) into smaller bounding boxes (`text lines`). This implementation
prevents loss of information from the object detection model and
facilitates more effective removal of duplicated `pdfminer` text. This
PR also addresses #3430.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-09-12 14:17:27 -07:00
+								    assert e.text.startswith("eastern mediterranean")
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    assert e.metadata.filename == os.path.basename(file_path)
 								    assert e.metadata.file_directory == os.path.split(file_path)[0]
-												feat: add metadata tracking to document elements (#225)

* add metadata field to elements

* metadata tracking for pdf/image

* metadata for html

* update expected outputs

* metadata for the rest of the document types

* take out file metadata for now

* add url to tables

* added metadata to test_auto

* bump version

* added coordinates to __init__

* fix coordinates in tests
											
										
										
											2023-02-15 13:26:20 -05:00
-												feat: improve pdfminer element processing (#3618)

This PR implements splitting of `pdfminer` elements (`groups of text
chunks`) into smaller bounding boxes (`text lines`). This implementation
prevents loss of information from the object detection model and
facilitates more effective removal of duplicated `pdfminer` text. This
PR also addresses #3430.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-09-12 14:17:27 -07:00
+								    e = elements[1]
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    assert isinstance(e, NarrativeText)
-												feat: improve pdfminer element processing (#3618)

This PR implements splitting of `pdfminer` elements (`groups of text
chunks`) into smaller bounding boxes (`text lines`). This implementation
prevents loss of information from the object detection model and
facilitates more effective removal of duplicated `pdfminer` text. This
PR also addresses #3430.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-09-12 14:17:27 -07:00
+								    assert e.text.startswith("We’re investing")
-												build(deps): update inference version (#662)

Updated to the latest version of unstructured-inference. detectron2 now gets implemented with onnxruntime, yay!

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
											
										
										
											2023-05-31 13:50:15 -05:00
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								@pytest.mark.parametrize(
 								    ("pass_metadata_filename", "content_type"),
 								    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
 								)
 								def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type: str | None):
-												feat: improve pdfminer element processing (#3618)

This PR implements splitting of `pdfminer` elements (`groups of text
chunks`) into smaller bounding boxes (`text lines`). This implementation
prevents loss of information from the object detection model and
facilitates more effective removal of duplicated `pdfminer` text. This
PR also addresses #3430.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-09-12 14:17:27 -07:00
+								    file_path = example_doc_path("pdf/chevron-page.pdf")
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    metadata_filename = file_path if pass_metadata_filename else None
 								    with open(file_path, "rb") as f:
 								        elements = partition(
 								            file=f,
 								            metadata_filename=metadata_filename,
 								            content_type=content_type,
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								            strategy=PartitionStrategy.HI_RES,
 								        )
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												feat: improve pdfminer element processing (#3618)

This PR implements splitting of `pdfminer` elements (`groups of text
chunks`) into smaller bounding boxes (`text lines`). This implementation
prevents loss of information from the object detection model and
facilitates more effective removal of duplicated `pdfminer` text. This
PR also addresses #3430.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-09-12 14:17:27 -07:00
+								    e = elements[0]
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert isinstance(e, Title)
-												feat: improve pdfminer element processing (#3618)

This PR implements splitting of `pdfminer` elements (`groups of text
chunks`) into smaller bounding boxes (`text lines`). This implementation
prevents loss of information from the object detection model and
facilitates more effective removal of duplicated `pdfminer` text. This
PR also addresses #3430.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-09-12 14:17:27 -07:00
+								    assert e.text.startswith("eastern mediterranean")
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												feat: improve pdfminer element processing (#3618)

This PR implements splitting of `pdfminer` elements (`groups of text
chunks`) into smaller bounding boxes (`text lines`). This implementation
prevents loss of information from the object detection model and
facilitates more effective removal of duplicated `pdfminer` text. This
PR also addresses #3430.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-09-12 14:17:27 -07:00
+								    e = elements[1]
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert isinstance(e, NarrativeText)
-												feat: improve pdfminer element processing (#3618)

This PR implements splitting of `pdfminer` elements (`groups of text
chunks`) into smaller bounding boxes (`text lines`). This implementation
prevents loss of information from the object detection model and
facilitates more effective removal of duplicated `pdfminer` text. This
PR also addresses #3430.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-09-12 14:17:27 -07:00
+								    assert e.text.startswith("We’re investing")
-												feat: extract tables (#503)

Exposes table extraction through partition and partition_pdf.
											
										
										
											2023-04-21 12:01:29 -05:00
-												rfctr(auto): add _PartitionerLoader (#3418)

**Summary**
Replace conditional explicit import of partitioner modules in
`.partition.auto` with the new `_PartitionerLoader` class. This avoids
unbound variable warnings and is much less noisy.

`_PartitionerLoader` makes use of the new `FileType` property
`.importable_package_dependencies` to determine whether all required
packages are importable before dispatching the file to its partitioner.
It uses `FileType.extra_name` to form a helpful error message when a
dependency is not installed, so the caller knows which `pip install`
extra to specify to remedy the error.

`_PartitionerLoader` uses the `FileType` properties
`.partitioner_module_qname` and `.partitioner_function_name` to load
the partitioner once its dependencies are verified. Loaded partitioners
are cached with module lifetime scope for efficiency.
											
										
										
											2024-07-21 23:03:55 -07:00
+								def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
 								    partition_pdf_ = function_mock(
 								        request,
 								        "unstructured.partition.pdf.partition_pdf",
 								        return_value=[NarrativeText("Hello there!")],
 								    )
 								    partitioner_loader_get_ = method_mock(
 								        request, _PartitionerLoader, "get", return_value=partition_pdf_
 								    )
-												refactor: restructure PDF/Image example document organization (#3410)

This PR aims to improve the organization and readability of our example
documents used in unit tests, specifically focusing on PDF and image
files.

### Summary
- Created two new subdirectories in the `example-docs` folder:
  - `pdf/`: for all PDF example files
  - `img/`: for all image example files
- Moved relevant PDF files from `example-docs/` to `example-docs/pdf/`
- Moved relevant image files from `example-docs/` to `example-docs/img/`
- Updated file paths in affected unit & ingest tests to reflect the new
directory structure

### Testing
All unit & ingest tests should be updated and verified to work with the
new file structure.

## Notes
Other file types (e.g., office documents, HTML files) remain in the root
of `example-docs/` for now.

## Next Steps
Consider similar reorganization for other file types if this structure
proves to be beneficial.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-07-18 15:21:32 -07:00
+								    file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf falls back to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
-												rfctr(auto): add _PartitionerLoader (#3418)

**Summary**
Replace conditional explicit import of partitioner modules in
`.partition.auto` with the new `_PartitionerLoader` class. This avoids
unbound variable warnings and is much less noisy.

`_PartitionerLoader` makes use of the new `FileType` property
`.importable_package_dependencies` to determine whether all required
packages are importable before dispatching the file to its partitioner.
It uses `FileType.extra_name` to form a helpful error message when a
dependency is not installed, so the caller knows which `pip install`
extra to specify to remedy the error.

`_PartitionerLoader` uses the `FileType` properties
`.partitioner_module_qname` and `.partitioner_function_name` to load
the partitioner once its dependencies are verified. Loaded partitioners
are cached with module lifetime scope for efficiency.
											
										
										
											2024-07-21 23:03:55 -07:00
+								    partition(file_path, strategy=PartitionStrategy.FAST)
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf falls back to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
-												rfctr(auto): add _PartitionerLoader (#3418)

**Summary**
Replace conditional explicit import of partitioner modules in
`.partition.auto` with the new `_PartitionerLoader` class. This avoids
unbound variable warnings and is much less noisy.

`_PartitionerLoader` makes use of the new `FileType` property
`.importable_package_dependencies` to determine whether all required
packages are importable before dispatching the file to its partitioner.
It uses `FileType.extra_name` to form a helpful error message when a
dependency is not installed, so the caller knows which `pip install`
extra to specify to remedy the error.

`_PartitionerLoader` uses the `FileType` properties
`.partitioner_module_qname` and `.partitioner_function_name` to load
the partitioner once its dependencies are verified. Loaded partitioners
are cached with module lifetime scope for efficiency.
											
										
										
											2024-07-21 23:03:55 -07:00
+								    partitioner_loader_get_.assert_called_once_with(ANY, FileType.PDF)
 								    partition_pdf_.assert_called_once_with(
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        filename=file_path,
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf falls back to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
+								        file=None,
 								        url=None,
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        strategy=PartitionStrategy.FAST,
-												detect document language across all partitioners (#1627)

### Summary
Closes #1534 and #1535
Detects document language using `langdetect` package. 
Creates new kwargs for user to set the document language (`languages`)
or detect the language at the element level instead of the default
document level (`detect_language_per_element`)

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
											
										
										
											2023-10-10 20:47:56 -05:00
+								        languages=None,
-												Feat: return base64 encoded images for PDF's (#2310)

Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition

elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
											
										
										
											2023-12-26 21:39:01 -08:00
+								        metadata_filename=None,
-												BREAKING CHANGE: revert table extraction off by default for PDFs and images (#3035)

### Summary

Closes #3021 . Turns table extraction for PDFs and images off by
default. The default behavior originally changed in #2588 . The reason
for reversion is that some users did not realize turning off table
extraction was an option and experienced long processing times for PDFs
and images with the new default behavior.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
											
										
										
											2024-05-17 11:28:11 -04:00
+								        infer_table_structure=False,
-												Feat: return base64 encoded images for PDF's (#2310)

Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition

elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
											
										
										
											2023-12-26 21:39:01 -08:00
+								        extract_images_in_pdf=False,
-												Refactor: rename image extraction kwargs (#2303)

Currently, we're using different kwarg names in partition() and
partition_pdf(), which has implications for the API since it goes
through partition().

### Summary
- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` to `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in
`partition.auto`
- add unit tests to test element extraction for `pdf/image` via
`partition.auto`
### Testing
CI should pass.
											
										
										
											2024-01-04 09:52:00 -08:00
+								        extract_image_block_types=None,
 								        extract_image_block_output_dir=None,
 								        extract_image_block_to_payload=False,
-												chore: add hi_res_model_name kwarg (#2289)

Closes #2160 

Explicitly adds `hi_res_model_name` as kwarg to relevant functions and
notes that `model_name` is to be deprecated.

Testing:
```
from unstructured.partition.auto import partition
filename = "example-docs/DA-1p.pdf"
elements = partition(filename, strategy="hi_res", hi_res_model_name="yolox")
```

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Steve Canny <stcanny@gmail.com>
Co-authored-by: Christine Straub <christinemstraub@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-12-22 09:06:54 -06:00
+								        hi_res_model_name=None,
-												Introduce `start_page` argument to partitioning functions that assign `element.metadata.page_number` (#2884)

This small change will be useful for users who partition only fragments
of their PDF documents.
It's a small step towards addressing this issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

Related PRs:
* https://github.com/Unstructured-IO/unstructured/pull/2842
* https://github.com/Unstructured-IO/unstructured/pull/2673
											
										
										
											2024-04-15 23:03:42 +02:00
+								        starting_page_number=1,
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf falls back to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
+								    )
-												fix: fix multiple values for infer_table_structure (#3870)

This PR fixes a bug when using `partition` to partition an email with
image attachments with hi_res and allow table structure inference -> the
partitioning of the image would encounter a value error: `got multiple
values for keyword argument 'infer_table_structure'`.

This is because when we pass `kwargs` into `partition` for "other" types
of files in this
[block](https://github.com/Unstructured-IO/unstructured/blob/50ea6fe7fc324efa09398898dc35d0cd4e78b1cf/unstructured/partition/auto.py#L270-L280),
`infer_table_structure` is packaged into `partitioning_kwargs`. Then, for
email at least, when there are attachments that can be partitioned with
`hi_res`, we pass that dict of `kwargs` right back into the `partition`
entry point, so when we get
[here](https://github.com/Unstructured-IO/unstructured/blob/50ea6fe7fc324efa09398898dc35d0cd4e78b1cf/unstructured/partition/auto.py#L222-L235)
we are both specifying `infer_table_structure` explicitly and have it in
the `kwargs` variable.

The fix is to detect first if `kwargs` already contains
`infer_table_structure` and if yes use that and pop it from `kwargs`.

---------

Co-authored-by: Kamil Plucinski <kamil.plucinski@deepsense.ai>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2025-01-17 12:41:04 -06:00
+								@pytest.mark.parametrize("infer_bool", [True, False])
 								def test_auto_handles_kwarg_with_infer_table_structure(infer_bool):
 								    # Checks that an explicit `infer_table_structure` keyword passed to `partition()`
 								    # alongside `pdf_infer_table_structure=True` is the value the patched
 								    # `process_file_with_ocr` receives, for both True and False.
 								    # Replace `process_file_with_ocr` with a mock so its call arguments can be inspected.
 								    with patch(
 								        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
 								    ) as mock_process_file_with_model:
 								        partition(
 								            example_doc_path("pdf/layout-parser-paper-fast.pdf"),
 								            pdf_infer_table_structure=True,
 								            strategy=PartitionStrategy.HI_RES,
 								            infer_table_structure=infer_bool,
 								        )
 								        # `call_args[1]` is the keyword-argument dict of the most recent call to the mock.
 								        assert mock_process_file_with_model.call_args[1]["infer_table_structure"] is infer_bool
 								def test_auto_handles_kwarg_with_infer_table_structure_when_none():
 								    # Checks that when `infer_table_structure=None` is passed explicitly together with
 								    # `pdf_infer_table_structure=True`, the patched `process_file_with_ocr` receives `True`.
 								    # NOTE(review): presumably `None` makes `partition()` fall back to the
 								    # `pdf_infer_table_structure` value -- confirm against `partition()` itself.
 								    with patch(
 								        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
 								    ) as mock_process_file_with_model:
 								        partition(
 								            example_doc_path("pdf/layout-parser-paper-fast.pdf"),
 								            pdf_infer_table_structure=True,
 								            strategy=PartitionStrategy.HI_RES,
 								            infer_table_structure=None,
 								        )
 								        # `call_args[1]` is the keyword-argument dict of the most recent call to the mock.
 								        assert mock_process_file_with_model.call_args[1]["infer_table_structure"] is True
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_pdf_uses_pdf_infer_table_structure_argument():
 								    with patch(
 								        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
 								    ) as mock_process_file_with_model:
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        partition(
-												refactor: restructure PDF/Image example document organization (#3410)

This PR aims to improve the organization and readability of our example
documents used in unit tests, specifically focusing on PDF and image
files.

### Summary
- Created two new subdirectories in the `example-docs` folder:
  - `pdf/`: for all PDF example files
  - `img/`: for all image example files
- Moved relevant PDF files from `example-docs/` to `example-docs/pdf/`
- Moved relevant image files from `example-docs/` to `example-docs/img/`
- Updated file paths in affected unit & ingest tests to reflect the new
directory structure

### Testing
All unit & ingest tests should be updated and verified to work with the
new file structure.

## Notes
Other file types (e.g., office documents, HTML files) remain in the root
of `example-docs/` for now.

## Next Steps
Consider similar reorganization for other file types if this structure
proves to be beneficial.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-07-18 15:21:32 -07:00
+								            example_doc_path("pdf/layout-parser-paper-fast.pdf"),
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								            pdf_infer_table_structure=True,
 								            strategy=PartitionStrategy.HI_RES,
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        )
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								        assert mock_process_file_with_model.call_args[1]["infer_table_structure"]
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												Refactor: rename image extraction kwargs (#2303)

Currently, we're using different kwarg names in partition() and
partition_pdf(), which has implications for the API since it goes
through partition().

### Summary
- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` to `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in
`partition.auto`
- add unit tests to test element extraction for `pdf/image` via
`partition.auto`
### Testing
CI should pass.
											
										
										
											2024-01-04 09:52:00 -08:00
+								@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_pdf_element_extraction(extract_image_block_to_payload: bool):
-												Refactor: rename image extraction kwargs (#2303)

Currently, we're using different kwarg names in partition() and
partition_pdf(), which has implications for the API since it goes
through partition().

### Summary
- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` to `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in
`partition.auto`
- add unit tests to test element extraction for `pdf/image` via
`partition.auto`
### Testing
CI should pass.
											
										
										
											2024-01-04 09:52:00 -08:00
+								    extract_image_block_types = ["Image", "Table"]
 								    with tempfile.TemporaryDirectory() as tmpdir:
 								        elements = partition(
-												refactor: restructure PDF/Image example document organization (#3410)

This PR aims to improve the organization and readability of our example
documents used in unit tests, specifically focusing on PDF and image
files.

### Summary
- Created two new subdirectories in the `example-docs` folder:
  - `pdf/`: for all PDF example files
  - `img/`: for all image example files
- Moved relevant PDF files from `example-docs/` to `example-docs/pdf/`
- Moved relevant image files from `example-docs/` to `example-docs/img/`
- Updated file paths in affected unit & ingest tests to reflect the new
directory structure

### Testing
All unit & ingest tests should be updated and verified to work with the
new file structure.

## Notes
Other file types (e.g., office documents, HTML files) remain in the root
of `example-docs/` for now.

## Next Steps
Consider similar reorganization for other file types if this structure
proves to be beneficial.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-07-18 15:21:32 -07:00
+								            example_doc_path("pdf/embedded-images-tables.pdf"),
-												Refactor: rename image extraction kwargs (#2303)

Currently, we're using different kwarg names in partition() and
partition_pdf(), which has implications for the API since it goes
through partition().

### Summary
- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` to `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in
`partition.auto`
- add unit tests to test element extraction for `pdf/image` via
`partition.auto`
### Testing
CI should pass.
											
										
										
											2024-01-04 09:52:00 -08:00
+								            extract_image_block_types=extract_image_block_types,
 								            extract_image_block_to_payload=extract_image_block_to_payload,
 								            extract_image_block_output_dir=tmpdir,
 								        )
 								        assert_element_extraction(
 								            elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
 								        )
-												fix: pass extract image args to all partitioners (#3950)

This is needed in order for the user to specify whether to extract the
base64 for images, which are now parsed by the html partitioner.

## Testing

Adds test that validates this by calling the auto-partitioner with
appropriate arguments partitioning an html file with base64 embedded
image.
											
										
										
											2025-03-09 21:15:08 -07:00
+								def test_auto_partition_html_element_extraction():
 								    extract_image_block_types = ["Image"]
 								    with tempfile.TemporaryDirectory() as tmpdir:
 								        elements = partition(
-												feat: support extracting image url in html (#3955)

also removes mimetype when base64 is not included in image metadata

---------

Co-authored-by: ryannikolaidis <ryannikolaidis@users.noreply.github.com>
											
										
										
											2025-03-13 15:41:10 -07:00
+								            example_doc_path("fake-html-with-base64-image.html"),
-												fix: pass extract image args to all partitioners (#3950)

This is needed in order for the user to specify whether to extract the
base64 for images, which are now parsed by the html partitioner.

## Testing

Adds test that validates this by calling the auto-partitioner with
appropriate arguments partitioning an html file with base64 embedded
image.
											
										
										
											2025-03-09 21:15:08 -07:00
+								            extract_image_block_types=extract_image_block_types,
 								            extract_image_block_to_payload=True,
 								        )
 								        assert_element_extraction(elements, extract_image_block_types, True, tmpdir)
-												feat: support extracting image url in html (#3955)

also removes mimetype when base64 is not included in image metadata

---------

Co-authored-by: ryannikolaidis <ryannikolaidis@users.noreply.github.com>
											
										
										
											2025-03-13 15:41:10 -07:00
+								def test_auto_partition_html_image_with_url():
 								    # Partitions an HTML example document with default arguments and checks that the
 								    # second element (index 1) carries a non-None `image_url` in its metadata.
 								    elements = partition(
 								        example_doc_path("fake-html-with-image-from-url.html"),
 								    )
 								    assert elements[1].metadata.image_url is not None
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_partition_pdf_does_not_raise_warning():
 								    # NOTE(robinson): This is the recommended way to check that no warning is emitted,
 								    # per the pytest docs.
 								    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
 								    #      #additional-use-cases-of-warnings-in-tests
 								    with warnings.catch_warnings():
 								        warnings.simplefilter("error")
 								        partition(
-												refactor: restructure PDF/Image example document organization (#3410)

This PR aims to improve the organization and readability of our example
documents used in unit tests, specifically focusing on PDF and image
files.

### Summary
- Created two new subdirectories in the `example-docs` folder:
  - `pdf/`: for all PDF example files
  - `img/`: for all image example files
- Moved relevant PDF files from `example-docs/` to `example-docs/pdf/`
- Moved relevant image files from `example-docs/` to `example-docs/img/`
- Updated file paths in affected unit & ingest tests to reflect the new
directory structure

### Testing
All unit & ingest tests should be updated and verified to work with the
new file structure.

## Notes
Other file types (e.g., office documents, HTML files) remain in the root
of `example-docs/` for now.

## Next Steps
Consider similar reorganization for other file types if this structure
proves to be beneficial.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-07-18 15:21:32 -07:00
+								            example_doc_path("pdf/layout-parser-paper-fast.pdf"), strategy=PartitionStrategy.HI_RES
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								        )
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# PPT
 								# ================================================================================================
-												feat: partition image (#144)

Adds partition_image to partition image file types, which is integrated into the partition brick. This relies on the 0.2.2 version of unstructured-inference.
											
										
										
											2023-01-13 22:24:13 -06:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_ppt_from_filename():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("fake-power-point.ppt")
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(file_path, strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
 								    assert elements == [
 								        Title(text="Adding a Bullet Slide"),
 								        ListItem(text="Find the bullet slide layout"),
 								        ListItem(text="Use _TextFrame.text for first bullet"),
 								        ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
 								        NarrativeText(text="Here is a lot of text!"),
 								        NarrativeText(text="Here is some text in a text box!"),
 								    ]
 								    assert all(e.metadata.filename == "fake-power-point.ppt" for e in elements)
 								    assert all(e.metadata.file_directory == example_doc_path("") for e in elements)
-												feat: partition image (#144)

Adds partition_image to partition image file types, which is integrated into the partition brick. This relies on the 0.2.2 version of unstructured-inference.
											
										
										
											2023-01-13 22:24:13 -06:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# PPTX
 								# ================================================================================================
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
 								def test_auto_partition_pptx_from_filename():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("fake-power-point.pptx")
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(file_path, strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
 								    assert elements == [
 								        Title(text="Adding a Bullet Slide"),
 								        ListItem(text="Find the bullet slide layout"),
 								        ListItem(text="Use _TextFrame.text for first bullet"),
 								        ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
 								        NarrativeText(text="Here is a lot of text!"),
 								        NarrativeText(text="Here is some text in a text box!"),
 								    ]
 								    assert all(e.metadata.filename == "fake-power-point.pptx" for e in elements)
 								    assert all(e.metadata.file_directory == example_doc_path("") for e in elements)
-												feat: optional page breaks for `.pptx`, `.pdf`, `.html` and images (#205)

* page breaks for pptx

* added page breaks for image/pdf

* tests for images with page breaks

* page breaks for html documents

* linting, linting, linting

* changelog and bump version

* update docs

* fix typo

* refactor reusable code to common.py

* add type back in
											
										
										
											2023-02-08 10:11:15 -05:00
-												fix(auto): partition() passes strategy to DOC,ODT (#3278)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_doc()` or `partition_odt()` and so was not
making its way to `partition_docx()`.
											
										
										
											2024-06-25 17:29:47 -07:00
+								@pytest.mark.parametrize("file_name", ["simple.pptx", "fake-power-point.ppt"])
-												fix(auto): partition() passes strategy to PPTX,DOCX (#3273)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_pptx()` or `partition_docx()`.
											
										
										
											2024-06-21 17:16:39 -07:00
+								@pytest.mark.parametrize(
 								    "strategy",
 								    [
 								        PartitionStrategy.AUTO,
 								        PartitionStrategy.FAST,
 								        PartitionStrategy.HI_RES,
 								        PartitionStrategy.OCR_ONLY,
 								    ],
 								)
-												fix(auto): partition() passes strategy to DOC,ODT (#3278)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_doc()` or `partition_odt()` and so was not
making its way to `partition_docx()`.
											
										
										
											2024-06-25 17:29:47 -07:00
+								def test_partition_forwards_strategy_arg_to_partition_pptx_and_its_brokers(
 								    request: FixtureRequest, file_name: str, strategy: str
 								):
 								    """The `strategy` arg value received by `partition()` is received by `partition_pptx().
-												fix(auto): partition() passes strategy to PPTX,DOCX (#3273)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_pptx()` or `partition_docx()`.
											
										
										
											2024-06-21 17:16:39 -07:00
-												fix(auto): partition() passes strategy to DOC,ODT (#3278)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_doc()` or `partition_odt()` and so was not
making its way to `partition_docx()`.
											
										
										
											2024-06-25 17:29:47 -07:00
+								    To do this in the brokering-partitioner case (PPT) the strategy argument must make its way to
 								    `partition_ppt()` which must then forward it to `partition_pptx()`. This test makes sure it
 								    made it all the way.
-												fix(auto): partition() passes strategy to PPTX,DOCX (#3273)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_pptx()` or `partition_docx()`.
											
										
										
											2024-06-21 17:16:39 -07:00
-												fix(auto): partition() passes strategy to DOC,ODT (#3278)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_doc()` or `partition_odt()` and so was not
making its way to `partition_docx()`.
											
										
										
											2024-06-25 17:29:47 -07:00
+								    Note this is 2 file-types X 4 strategies = 8 test-cases.
 								    """
-												Feat/pass down strategy to partition ppt as well (#3274)

Follow the same pattern as
https://github.com/Unstructured-IO/unstructured/pull/3273 and pass the
`strategy` parameter down to `partition_ppt` as well.
											
										
										
											2024-06-21 21:23:58 -05:00
+								    from unstructured.partition.pptx import _PptxPartitioner
 								    def fake_iter_presentation_elements(self: _PptxPartitioner) -> Iterator[Element]:
 								        yield Text(f"strategy=={self._opts.strategy}")
 								    _iter_elements_ = method_mock(
 								        request,
 								        _PptxPartitioner,
 								        "_iter_presentation_elements",
 								        side_effect=fake_iter_presentation_elements,
 								    )
-												fix(auto): partition() passes strategy to DOC,ODT (#3278)

**Summary**
Remedy gap where `strategy` argument passed to `partition()` was not
forwarded to `partition_doc()` or `partition_odt()` and so was not
making its way to `partition_docx()`.
											
										
										
											2024-06-25 17:29:47 -07:00
+								    (element,) = partition(example_doc_path(file_name), strategy=strategy)
-												Feat/pass down strategy to partition ppt as well (#3274)

Follow the same pattern as
https://github.com/Unstructured-IO/unstructured/pull/3273 and pass the
`strategy` parameter down to `partition_ppt` as well.
											
										
										
											2024-06-21 21:23:58 -05:00
 								    _iter_elements_.assert_called_once_with(ANY)
 								    assert element.text == f"strategy=={strategy}"
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# RST
 								# ================================================================================================
-												feat: add `partition_epub` function (#364)

* add pypandoc dependency

* added epub partitioner and file conversion

* test for partition_epub

* tests for file conversion

* add epub to filetype detection

* added epub to auto partition

* update bricks docs

* updated installing docs

* changelog and version

* add pandoc to dependencies

* add pandoc to debian dependencies

* linting, linting, linting

* typo fix

* typo fix

* file conversion type hints

* more type hints

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-03-14 11:52:21 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_rst_from_filename():
 								    elements = partition(example_doc_path("README.rst"))
-												feat: add `partition_epub` function (#364)

* add pypandoc dependency

* added epub partitioner and file conversion

* test for partition_epub

* tests for file conversion

* add epub to filetype detection

* added epub to auto partition

* update bricks docs

* updated installing docs

* changelog and version

* add pandoc to dependencies

* add pandoc to debian dependencies

* linting, linting, linting

* typo fix

* typo fix

* file conversion type hints

* more type hints

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-03-14 11:52:21 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements[0] == Title("Example Docs")
 								    assert elements[0].metadata.filetype == "text/x-rst"
-												feat: add `partition_epub` function (#364)

* add pypandoc dependency

* added epub partitioner and file conversion

* test for partition_epub

* tests for file conversion

* add epub to filetype detection

* added epub to auto partition

* update bricks docs

* updated installing docs

* changelog and version

* add pandoc to dependencies

* add pandoc to debian dependencies

* linting, linting, linting

* typo fix

* typo fix

* file conversion type hints

* more type hints

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-03-14 11:52:21 -04:00
-												feat: add `partition_msg` for MSFT Outlook files (#412)

* added msg-parser dependency

* pass through kwargs in convert_file_to_text

* added partition_msg for processing msft outlook files

* version bump and changelog

* added tests for partition_msg

* added test for msg with plain text

* add partition_msg docs; fix underlines in integration docs

* add .msg to file list

* finish tests for auto msg

* linting, linting, linting
											
										
										
											2023-03-28 16:15:22 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_rst_from_file():
 								    with open(example_doc_path("README.rst"), "rb") as f:
 								        elements = partition(file=f, content_type="text/x-rst")
-												feat: add `partition_msg` for MSFT Outlook files (#412)

* added msg-parser dependency

* pass through kwargs in convert_file_to_text

* added partition_msg for processing msft outlook files

* version bump and changelog

* added tests for partition_msg

* added test for msg with plain text

* add partition_msg docs; fix underlines in integration docs

* add .msg to file list

* finish tests for auto msg

* linting, linting, linting
											
										
										
											2023-03-28 16:15:22 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements[0] == Title("Example Docs")
 								    assert elements[0].metadata.filetype == "text/x-rst"
-												feat: add `partition_msg` for MSFT Outlook files (#412)

* added msg-parser dependency

* pass through kwargs in convert_file_to_text

* added partition_msg for processing msft outlook files

* version bump and changelog

* added tests for partition_msg

* added test for msg with plain text

* add partition_msg docs; fix underlines in integration docs

* add .msg to file list

* finish tests for auto msg

* linting, linting, linting
											
										
										
											2023-03-28 16:15:22 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# RTF
 								# ================================================================================================
-												feat: add `partition_rtf` for rich text files (#466)

* refactor epub; add rtf

* added test for rtf files

* filetype detection for rtf files

* add rtf to auto

* update docs for group_broken_paragraphs

* add rtf to docs

* update file list in readme

* update stage_for_transformers docs

* changelog and version bump

* skip rtf if in docker

* skip test if rtf not supported

* docs tweaks
											
										
										
											2023-04-10 17:25:03 -04:00
 								def test_auto_partition_rtf_from_filename():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(example_doc_path("fake-doc.rtf"), strategy=PartitionStrategy.HI_RES)
-												feat: add `partition_rtf` for rich text files (#466)

* refactor epub; add rtf

* added test for rtf files

* filetype detection for rtf files

* add rtf to auto

* update docs for group_broken_paragraphs

* add rtf to docs

* update file list in readme

* update stage_for_transformers docs

* changelog and version bump

* skip rtf if in docker

* skip test if rtf not supported

* docs tweaks
											
										
										
											2023-04-10 17:25:03 -04:00
+								    assert elements[0] == Title("My First Heading")
-												feat: add `url` kwarg to `partititon` (#470)

* added url option to auto partition

* add test for partition from url

* version and changelog

* update docs

* add url to element metadata
											
										
										
											2023-04-12 14:31:01 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# TSV
 								# ================================================================================================
-												feat: allow headers in `partition` (#473)

* feat: allow headers in `partition`

* warning if header is set and url is not

* update emoji test
											
										
										
											2023-04-13 11:04:15 -04:00
-												fix: parse URL response Content-Type according to RFC 9110 (#2950)

Currently, `file_and_type_from_url()` does not correctly handle the
`Content-Type` header. Specifically, it assumes that the header contains
only the mime-type (e.g. `text/html`), however, [RFC
9110](https://www.rfc-editor.org/rfc/rfc9110#field.content-type) allows
for additional directives — specifically the `charset` — to be returned
in the header. This leads to a `ValueError` when loading a URL with a
response Content-Type header such as `text/html; charset=UTF-8`.

To reproduce the issue:

```python
from unstructured.partition.auto import partition

url = "https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/"
partition(url=url)
```

Which will result in the following exception:

```python
{
	"name": "ValueError",
	"message": "Invalid file. The FileType.UNK file type is not supported in partition.",
	"stack": "---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[1], line 4
      1 from unstructured.partition.auto import partition
      3 url = \"https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/\"
----> 4 partition(url=url)

File ~/miniconda3/envs/ai-tasks/lib/python3.11/site-packages/unstructured/partition/auto.py:541, in partition(filename, content_type, file, file_filename, url, include_page_breaks, strategy, encoding, paragraph_grouper, headers, skip_infer_table_types, ssl_verify, ocr_languages, languages, detect_language_per_element, pdf_infer_table_structure, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, xml_keep_tags, data_source_metadata, metadata_filename, request_timeout, hi_res_model_name, model_name, date_from_file_object, starting_page_number, **kwargs)
    539 else:
    540     msg = \"Invalid file\" if not filename else f\"Invalid file {filename}\"
--> 541     raise ValueError(f\"{msg}. The {filetype} file type is not supported in partition.\")
    543 for element in elements:
    544     element.metadata.url = url

ValueError: Invalid file. The FileType.UNK file type is not supported in partition."
}
```

This PR fixes the issue by parsing the mime-type out of the
`Content-Type` header string.


Closes #2257
											
										
										
											2024-04-30 07:53:44 +02:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_tsv_from_filename():
 								    elements = partition(example_doc_path("stanley-cups.tsv"))
-												fix: parse URL response Content-Type according to RFC 9110 (#2950)

Currently, `file_and_type_from_url()` does not correctly handle the
`Content-Type` header. Specifically, it assumes that the header contains
only the mime-type (e.g. `text/html`), however, [RFC
9110](https://www.rfc-editor.org/rfc/rfc9110#field.content-type) allows
for additional directives — specifically the `charset` — to be returned
in the header. This leads to a `ValueError` when loading a URL with a
response Content-Type header such as `text/html; charset=UTF-8`.

To reproduce the issue:

```python
from unstructured.partition.auto import partition

url = "https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/"
partition(url=url)
```

Which will result in the following exception:

```python
{
	"name": "ValueError",
	"message": "Invalid file. The FileType.UNK file type is not supported in partition.",
	"stack": "---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[1], line 4
      1 from unstructured.partition.auto import partition
      3 url = \"https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/\"
----> 4 partition(url=url)

File ~/miniconda3/envs/ai-tasks/lib/python3.11/site-packages/unstructured/partition/auto.py:541, in partition(filename, content_type, file, file_filename, url, include_page_breaks, strategy, encoding, paragraph_grouper, headers, skip_infer_table_types, ssl_verify, ocr_languages, languages, detect_language_per_element, pdf_infer_table_structure, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, xml_keep_tags, data_source_metadata, metadata_filename, request_timeout, hi_res_model_name, model_name, date_from_file_object, starting_page_number, **kwargs)
    539 else:
    540     msg = \"Invalid file\" if not filename else f\"Invalid file {filename}\"
--> 541     raise ValueError(f\"{msg}. The {filetype} file type is not supported in partition.\")
    543 for element in elements:
    544     element.metadata.url = url

ValueError: Invalid file. The FileType.UNK file type is not supported in partition."
}
```

This PR fixes the issue by parsing the mime-type out of the
`Content-Type` header string.


Closes #2257
											
										
										
											2024-04-30 07:53:44 +02:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
 								    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
 								    assert elements[0].metadata.filetype == "text/tsv"
-												fix: parse URL response Content-Type according to RFC 9110 (#2950)

Currently, `file_and_type_from_url()` does not correctly handle the
`Content-Type` header. Specifically, it assumes that the header contains
only the mime-type (e.g. `text/html`), however, [RFC
9110](https://www.rfc-editor.org/rfc/rfc9110#field.content-type) allows
for additional directives — specifically the `charset` — to be returned
in the header. This leads to a `ValueError` when loading a URL with a
response Content-Type header such as `text/html; charset=UTF-8`.

To reproduce the issue:

```python
from unstructured.partition.auto import partition

url = "https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/"
partition(url=url)
```

Which will result in the following exception:

```python
{
	"name": "ValueError",
	"message": "Invalid file. The FileType.UNK file type is not supported in partition.",
	"stack": "---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[1], line 4
      1 from unstructured.partition.auto import partition
      3 url = \"https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/\"
----> 4 partition(url=url)

File ~/miniconda3/envs/ai-tasks/lib/python3.11/site-packages/unstructured/partition/auto.py:541, in partition(filename, content_type, file, file_filename, url, include_page_breaks, strategy, encoding, paragraph_grouper, headers, skip_infer_table_types, ssl_verify, ocr_languages, languages, detect_language_per_element, pdf_infer_table_structure, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, xml_keep_tags, data_source_metadata, metadata_filename, request_timeout, hi_res_model_name, model_name, date_from_file_object, starting_page_number, **kwargs)
    539 else:
    540     msg = \"Invalid file\" if not filename else f\"Invalid file {filename}\"
--> 541     raise ValueError(f\"{msg}. The {filetype} file type is not supported in partition.\")
    543 for element in elements:
    544     element.metadata.url = url

ValueError: Invalid file. The FileType.UNK file type is not supported in partition."
}
```

This PR fixes the issue by parsing the mime-type out of the
`Content-Type` header string.


Closes #2257
											
										
										
											2024-04-30 07:53:44 +02:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# TXT
 								# ================================================================================================
-												Fix: partition on empty or whitespace-only text files (#3675)

This is a fix for this
[bug](https://github.com/Unstructured-IO/unstructured/issues/3674), auto partition fails on text files which are empty or contain only whitespaces

Inference of .txt file type fails if the file has only whitespaces.

To Reproduce:

```
from tempfile import NamedTemporaryFile

from unstructured.partition.auto import partition

with NamedTemporaryFile(mode="w", suffix=".txt") as f:
    f.write("   \n")
    f.seek(0)
    elements = partition(filename=f.name)
```
											
										
										
											2024-09-29 06:16:33 +02:00
+								@pytest.mark.parametrize(
 								    ("filename", "expected_elements"),
 								    [
 								        (
 								            "fake-text.txt",
 								            [
 								                NarrativeText(text="This is a test document to use for unit tests."),
 								                Address(text="Doylestown, PA 18901"),
 								                Title(text="Important points:"),
 								                ListItem(text="Hamburgers are delicious"),
 								                ListItem(text="Dogs are the best"),
 								                ListItem(text="I love fuzzy blankets"),
 								            ],
 								        ),
 								        ("fake-text-all-whitespace.txt", []),
 								    ],
 								)
 								def test_auto_partition_text_from_filename(filename: str, expected_elements: list[Element]):
 								    file_path = example_doc_path(filename)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(filename=file_path, strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												Fix: partition on empty or whitespace-only text files (#3675)

This is a fix for this
[bug](https://github.com/Unstructured-IO/unstructured/issues/3674), auto partition fails on text files which are empty or contain only whitespaces

Inference of .txt file type fails if the file has only whitespaces.

To Reproduce:

```
from tempfile import NamedTemporaryFile

from unstructured.partition.auto import partition

with NamedTemporaryFile(mode="w", suffix=".txt") as f:
    f.write("   \n")
    f.seek(0)
    elements = partition(filename=f.name)
```
											
										
										
											2024-09-29 06:16:33 +02:00
+								    assert elements == expected_elements
 								    assert all(e.metadata.filename == filename for e in elements)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert all(e.metadata.file_directory == example_doc_path("") for e in elements)
-												fix: update `detect_filetype` for JSONs with text/plain MIME type (#520)

* check to see if text file is a json

* add json check into filetype detection

* added test for updated file detection logic

* bytes/strings handling

* changelog and version bump
											
										
										
											2023-04-26 13:52:47 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_text_from_file():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    with open(example_doc_path("fake-text.txt"), "rb") as f:
-												Refactor: partition pdf (#2074)

### Summary
- add constants for strategies
- add `_process_uncategorized_text_elements()` to remove code block
duplication
### Testing
CI should pass.
											
										
										
											2023-11-15 21:41:02 -08:00
+								        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert len(elements) > 0
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert elements == [
 								        NarrativeText(text="This is a test document to use for unit tests."),
 								        Address(text="Doylestown, PA 18901"),
 								        Title(text="Important points:"),
 								        ListItem(text="Hamburgers are delicious"),
 								        ListItem(text="Dogs are the best"),
 								        ListItem(text="I love fuzzy blankets"),
 								    ]
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# XLS
 								# ================================================================================================
-												Introduce `start_page` argument to partitioning functions that assign `element.metadata.page_number` (#2884)

This small change will be useful for users who partition only fragments
of their PDF documents.
It's a small step towards addressing this issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

Related PRs:
* https://github.com/Unstructured-IO/unstructured/pull/2842
* https://github.com/Unstructured-IO/unstructured/pull/2673
											
										
										
											2024-04-15 23:03:42 +02:00
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_xls_from_filename():
 								    elements = partition(
 								        example_doc_path("tests-example.xls"), include_header=False, skip_infer_table_types=[]
 								    )
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
-												fix(xlsx): xlsx subtable algorithm (#2534)

**Reviewers:** It may be easier to review each of the two commits
separately. The first adds the new `_SubtableParser` object with its
unit-tests and the second one uses that object to replace the flawed
existing subtable-parsing algorithm.

**Summary**

There is a cluster of bugs in `partition_xlsx()` that all derive from
flaws in the algorithm we use to detect "subtables". These are
encountered when the user wants to get multiple document-elements from
each worksheet, which is the default (argument `find_subtable = True`).

This PR replaces the flawed existing algorithm with a `_SubtableParser`
object that encapsulates all that logic and has thorough unit-tests.

**Additional Context**

This is a summary of the failure cases. There are a few other cases but
they're closely related and this was enough evidence and scope for my
purposes. This PR fixes all these bugs:
```python
    #
    # -- ✅ CASE 1: There are no leading or trailing single-cell rows.
    #       -> the subtable functions never get called, subtable is emitted as the only element
    #
    #    a b  -> Table(a, b, c, d)
    #    c d

    # -- ✅ CASE 2: There is exactly one leading single-cell row.
    #       -> Leading single-cell row emitted as `Title` element, core-table properly identified.
    #
    #    a    -> [ Title(a),
    #    b c       Table(b, c, d, e) ]
    #    d e

    # -- ❌ CASE 3: There are two-or-more leading single-cell rows.
    #       -> leading single-cell rows are included in subtable
    #
    #    a    -> [ Table(a, b, c, d, e, f) ]
    #    b
    #    c d
    #    e f

    # -- ❌ CASE 4: There is exactly one trailing single-cell row.
    #      -> core table is dropped. trailing single-cell row is emitted as Title
    #         (this is the behavior in the reported bug)
    #
    #    a b  -> [ Title(e) ]
    #    c d
    #      e

    # -- ❌ CASE 5: There are two-or-more trailing single-cell rows.
    #      -> core table is dropped. trailing single-cell rows are each emitted as a Title
    #
    #    a b  -> [ Title(e),
    #    c d       Title(f) ]
    #      e
    #      f

    # -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows.
    #      -> core table is correctly identified, leading and trailing single-cell rows are each
    #         emitted as a Title.
    #
    #      a  -> [ Title(a),
    #    b c       Table(b, c, d, e),
    #    d e       Title(f) ]
    #    f

    # -- ✅ CASE 7: There are two leading and one trailing single-cell rows.
    #      -> core table is correctly identified, leading and trailing single-cell rows are each
    #         emitted as a Title.
    #
    #    a    -> [ Title(a),
    #    b         Title(b),
    #    c d       Table(c, d, e, f),
    #    e f       Title(g) ]
    #      g

    # -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows.
    #      -> core table is correctly identified, leading and trailing single-cell rows are each
    #         emitted as a Title.
    #
    #      a  -> [ Title(a),
    #      b       Title(b),
    #    c d       Table(c, d, e, f),
    #    e f       Title(g),
    #    g         Title(h) ]
    #    h

    # -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below.
    #      -> First cell is mistakenly emitted as title, remaining cells are dropped.
    #
    #    a b c  -> [ Title(a) ]

    # -- ❌ CASE 10: Single-row subtable with one leading single-cell row.
    #      -> Leading single-cell row is correctly identified as title, core-table is mis-identified
    #         as a `Title` and truncated.
    #
    #    a      -> [ Title(a),
    #    b c d       Title(b) ]
```
											
										
										
											2024-02-13 20:29:17 -08:00
+								    assert len(elements) == 14
-												fix(xlsx): XLSX emits std minified .text_as_html (#3558)

**Summary**
Eliminate historical "idiosyncrasies" of `table.metadata.text_as_html`
HTML introduced by `partition_xlsx()`. Produce minified `.text_as_html`
consistent with that formed by chunking.

**Additional Context**
- XLSX `.text_as_html` is minified (no extra whitespace or thead, tbody,
tfoot elements).
- `table.text` is clean-concatenated-text (CCT) of table.

---------

Co-authored-by: scanny <scanny@users.noreply.github.com>
											
										
										
											2024-10-17 15:05:11 -07:00
+								    assert sum(isinstance(e, Table) for e in elements) == 2
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
+								    assert elements[0].metadata.text_as_html == EXPECTED_XLS_TABLE
-												fix(xlsx): XLSX emits std minified .text_as_html (#3558)

**Summary**
Eliminate historical "idiosyncrasies" of `table.metadata.text_as_html`
HTML introduced by `partition_xlsx()`. Produce minified `.text_as_html`
consistent with that formed by chunking.

**Additional Context**
- XLSX `.text_as_html` is minified (no extra whitespace or thead, tbody,
tfoot elements).
- `table.text` is clean-concatenated-text (CCT) of table.

---------

Co-authored-by: scanny <scanny@users.noreply.github.com>
											
										
										
											2024-10-17 15:05:11 -07:00
+								    assert len(elements[0].text) == 507
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# XLSX
 								# ================================================================================================
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_xlsx_from_filename():
 								    elements = partition(
 								        example_doc_path("stanley-cups.xlsx"), include_header=False, skip_infer_table_types=[]
 								    )
-												Chore: Pass table support  param to partition image (#973)

* add param and test in image table extraction

* version and changelog

* need to publish this one for api repo

* add new param skip_infer_table_types

* use warning

* clean up with mapping

* add test for tsv

* fix test fail

* weird change from merge

* doc nit

* don't use mapping

* correct conflict
											
										
										
											2023-07-27 13:33:36 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert len(elements) == 4
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert sum(isinstance(e, Table) for e in elements) == 2
 								    assert sum(isinstance(e, Title) for e in elements) == 2
 								    assert clean_extra_whitespace(elements[0].text) == "Stanley Cups"
 								    assert clean_extra_whitespace(elements[1].text) == (
 								        "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
 								    )
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert all(e.metadata.page_number == 1 for e in elements[:2])
 								    assert all(e.metadata.page_number == 2 for e in elements[2:])
 								    assert all(
 								        e.metadata.filetype == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
 								        for e in elements
 								    )
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
-												enhancement: handling for empty files in `detect_filetype` and `partition` (#710)

* add empty filetype

* add empty handling to partition

* changelog and version
											
										
										
											2023-06-09 16:07:50 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_xlsx_from_file():
 								    with open(example_doc_path("stanley-cups.xlsx"), "rb") as f:
 								        elements = partition(file=f, include_header=False, skip_infer_table_types=[])
-												enhancement: handling for empty files in `detect_filetype` and `partition` (#710)

* add empty filetype

* add empty handling to partition

* changelog and version
											
										
										
											2023-06-09 16:07:50 -04:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert len(elements) == 4
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert sum(isinstance(element, Table) for element in elements) == 2
 								    assert sum(isinstance(element, Title) for element in elements) == 2
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert clean_extra_whitespace(elements[0].text) == "Stanley Cups"
 								    assert clean_extra_whitespace(elements[1].text) == (
 								        "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
 								    )
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert all(e.metadata.page_number == 1 for e in elements[:2])
 								    assert all(e.metadata.page_number == 2 for e in elements[2:])
 								    assert all(
 								        e.metadata.filetype == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
 								        for e in elements
 								    )
-												feature(html partition): parse pre tag (#642)

* feature(html partition): parse pre tag

* chore: update CHANGELOG.md

* style: black format xml.py

* Added tests for html with pre tag

* remove skip test, update parse pre tag

* fix style

* chore: spell check

* chore: update changelog & version

* chore: update ingest test fixtures

* chore: add exception handling if `element.text` is `None` in `_read_xml`

* test: add more sanity testing on the `.text` content of the element(s)

* refactor: move the conditional logic for <pre> outside of the `try/except` block

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
											
										
										
											2023-06-27 21:52:39 +03:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_xlsx_respects_starting_page_number_argument():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(example_doc_path("stanley-cups.xlsx"), starting_page_number=3)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert all(e.metadata.page_number == 3 for e in elements[:2])
 								    assert all(e.metadata.page_number == 4 for e in elements[2:])
-												enhancement: handling for empty files in `detect_filetype` and `partition` (#710)

* add empty filetype

* add empty handling to partition

* changelog and version
											
										
										
											2023-06-09 16:07:50 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# XML
 								# ================================================================================================
-												feat: `partition_rst` for ReStructured Text documents (#725)

* add example rst file

* filetype detection for rst files

* add partition_rst function

* add partition_rst to auto

* update readme

* update docs

* changelog and version

* pandocs -> pandoc

* fix typo
											
										
										
											2023-06-12 15:31:10 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_xml_from_filename():
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    elements = partition(example_doc_path("factbook.xml"), xml_keep_tags=False)
-												feat: partition_org for Org Mode documents (#780)

* feat: partition_org for Org Mode documents

* update version
											
										
										
											2023-06-23 20:45:31 +02:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements[0].text == "United States"
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert all(e.metadata.filename == "factbook.xml" for e in elements)
-												feat: partition_org for Org Mode documents (#780)

* feat: partition_org for Org Mode documents

* update version
											
										
										
											2023-06-23 20:45:31 +02:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_xml_from_file():
 								    with open(example_doc_path("factbook.xml"), "rb") as f:
 								        elements = partition(file=f, xml_keep_tags=False)
-												feat: partition_org for Org Mode documents (#780)

* feat: partition_org for Org Mode documents

* update version
											
										
										
											2023-06-23 20:45:31 +02:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements[0].text == "United States"
-												feat: partition_org for Org Mode documents (#780)

* feat: partition_org for Org Mode documents

* update version
											
										
										
											2023-06-23 20:45:31 +02:00
-												feat: `partition_rst` for ReStructured Text documents (#725)

* add example rst file

* filetype detection for rst files

* add partition_rst function

* add partition_rst to auto

* update readme

* update docs

* changelog and version

* pandocs -> pandoc

* fix typo
											
										
										
											2023-06-12 15:31:10 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_xml_from_filename_with_tags():
 								    elements = partition(example_doc_path("factbook.xml"), xml_keep_tags=True)
-												feat: `partition_rst` for ReStructured Text documents (#725)

* add example rst file

* filetype detection for rst files

* add partition_rst function

* add partition_rst to auto

* update readme

* update docs

* changelog and version

* pandocs -> pandoc

* fix typo
											
										
										
											2023-06-12 15:31:10 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert "<leader>Joe Biden</leader>" in elements[0].text
 								    assert elements[0].metadata.filename == "factbook.xml"
-												feat: `partition_rst` for ReStructured Text documents (#725)

* add example rst file

* filetype detection for rst files

* add partition_rst function

* add partition_rst to auto

* update readme

* update docs

* changelog and version

* pandocs -> pandoc

* fix typo
											
										
										
											2023-06-12 15:31:10 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_xml_from_file_with_tags():
 								    with open(example_doc_path("factbook.xml"), "rb") as f:
 								        elements = partition(file=f, xml_keep_tags=True)
-												fixed filename metadata bug when using file and file_filename (#1002)


											
										
										
											2023-08-02 18:14:15 -07:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert "<leader>Joe Biden</leader>" in elements[0].text
-												fixed filename metadata bug when using file and file_filename (#1002)


											
										
										
											2023-08-02 18:14:15 -07:00
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# FILE_TYPE NOT RECOGNIZED OR NOT SUPPORTED
 								# ================================================================================================
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								def test_auto_partition_raises_with_bad_type(request: FixtureRequest):
 								    detect_filetype_ = function_mock(
-												rfctr(auto): add _PartitionerLoader (#3418)

**Summary**
Replace conditional explicit import of partitioner modules in
`.partition.auto` with the new `_PartitionerLoader` class. This avoids
unbound variable warnings and is much less noisy.

`_PartitionerLoader` makes use of the new `FileType` property
`.importable_package_dependencies` to determine whether all required
packages are importable before dispatching the file to its partitioner.
It uses `FileType.extra_name` to form a helpful error message when a
dependency is not installed, so the caller knows which `pip install`
extra to specify to remedy the error.

`_PartitionerLoader` uses the `FileType` properties
`.partitioner_module_qname` and `.partitioner_function_name` to load
the partitioner once its dependencies are verified. Loaded partitioners
are cached with module lifetime scope for efficiency.
											
										
										
											2024-07-21 23:03:55 -07:00
+								        request, "unstructured.partition.auto.detect_filetype", return_value=FileType.UNK
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    )
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
-												rfctr(email): eml partitioner rewrite (#3694)

**Summary**
Initial attempts to incrementally refactor `partition_email()` into
shape to allow pluggable partitioning quickly became too complex for
ready code-review. Prepare separate rewritten module and tests and swap
them out whole.

**Additional Context**
- Uses the modern stdlib `email` module to reliably accomplish several
manual decoding steps in the legacy code.
- Remove obsolete email-specific element-types which were replaced 18
months or so ago with email-specific metadata fields for things like Cc:
addresses, subject, etc.
- Remove accepting an email as `text: str` because MIME-email is
inherently a binary format which can and often does contain multiple and
contradictory character-encodings.
- Remove the `encoding` parameter as it is now unused. An email file is not
a text file and as such does not have a single overall encoding.
Character encoding is specified individually for each MIME-part within
the message and often varies from one part to another in the same
message.
- Remove the need for a caller to specify `attachment_partitioner`.
There is only one reasonable choice for this which is
`auto.partition()`, consistent with the same interface and operation in
`partition_msg()`.
- Fixes #3671 along the way by silently skipping attachments with a
file-type for which there is no partitioner.
- Substantially extend the test-suite to cover multiple
transport-encoding/charset combinations.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: scanny <scanny@users.noreply.github.com>
											
										
										
											2024-10-15 19:02:33 -07:00
+								    with pytest.raises(
 								        UnsupportedFileFormatError,
-												rfctr: prep for pluggable partitioners (#3806)

**Summary**
Prepare auto-partitioning for pluggable partitioners.

Move toward a uniform partitioner call signature in `auto/partition()`
such that a custom or override partitioner can be registered without
requiring code changes.

**Additional Context**
The central job of `auto/partition()` is to detect the file-type of the
given file and use that to dispatch partitioning to the corresponding
partitioner function e.g. `partition_pdf()` or `partition_docx()`.

In the existing code, each partitioner function is called with
parameters "hand-picked" from the available parameters passed to the
`partition()` function. This is unnecessary and couples those
partitioners tightly with the dispatch function. The desired state is
that all available arguments are passed as `kwargs` and the partitioner
function "self-selects" the arguments it will be sensitive to, applies
its own appropriate default values when the argument is omitted, and
simply ignores any arguments it doesn't use. Note that achieving this
requires no changes to partitioner functions because they already do
precisely this.

So the job is to pass all arguments (other than `filename` and `file`)
to the partitioner as `kwargs`. This will allow additional or alternate
partitioners to be registered at runtime and dispatched to, because as
long as they have the signature `partition_x(filename, file, kwargs) ->
list[Element]` then they can be dispatched to without customization.
											
										
										
											2024-12-10 12:44:34 -08:00
+								        match="Partitioning is not supported for the FileType.UNK file type.",
-												rfctr(email): eml partitioner rewrite (#3694)

**Summary**
Initial attempts to incrementally refactor `partition_email()` into
shape to allow pluggable partitioning quickly became too complex for
ready code-review. Prepare separate rewritten module and tests and swap
them out whole.

**Additional Context**
- Uses the modern stdlib `email` module to reliably accomplish several
manual decoding steps in the legacy code.
- Remove obsolete email-specific element-types which were replaced 18
months or so ago with email-specific metadata fields for things like Cc:
addresses, subject, etc.
- Remove accepting an email as `text: str` because MIME-email is
inherently a binary format which can and often does contain multiple and
contradictory character-encodings.
- Remove the `encoding` parameter as it is now unused. An email file is not
a text file and as such does not have a single overall encoding.
Character encoding is specified individually for each MIME-part within
the message and often varies from one part to another in the same
message.
- Remove the need for a caller to specify `attachment_partitioner`.
There is only one reasonable choice for this which is
`auto.partition()`, consistent with the same interface and operation in
`partition_msg()`.
- Fixes #3671 along the way by silently skipping attachments with a
file-type for which there is no partitioner.
- Substantially extend the test-suite to cover multiple
transport-encoding/charset combinations.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: scanny <scanny@users.noreply.github.com>
											
										
										
											2024-10-15 19:02:33 -07:00
+								    ):
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)
-												enhancement: tell users to install missing extras (#1167)

### Summary

Updates `partition` to let users know to install the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating `partition_pdf` (or whichever function that requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
											
										
										
											2023-08-21 23:00:21 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    detect_filetype_.assert_called_once_with(
-												rfctr(file): refactor detect_filetype() (#3429)

**Summary**
In preparation for fixing a cluster of bugs with automatic file-type
detection and paving the way for some reliability improvements, refactor
`unstructured.file_utils.filetype` module and improve thoroughness of
tests.

**Additional Context**
Factor type-recognition process into three distinct strategies that are
attempted in sequence. Attempted in order of preference,
type-recognition falls to the next strategy when the one before it is
not applicable or cannot determine the file-type. This provides a clear
basis for organizing the code and tests at the top level.

Consolidate the existing tests around these strategies, adding
additional cases to achieve better coverage.

Several bugs were uncovered in the process. Small ones were just fixed,
bigger ones will be remedied in following PRs.
											
										
										
											2024-07-23 16:18:48 -07:00
+								        file_path="made-up.fake",
 								        file=None,
 								        encoding=None,
 								        content_type=None,
 								        metadata_file_path=None,
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    )
-												enhancement: tell users to install missing extras (#1167)

### Summary

Updates `partition` to let users know to install the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating `partition_pdf` (or whichever function that requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
											
										
										
											2023-08-21 23:00:21 -04:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# LOAD FROM URL
 								# ================================================================================================
 								def test_auto_partition_from_url():
 								    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    elements = partition(url=url, content_type="text/plain", strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements[0] == Title("Apache License")
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert all(e.metadata.url == url for e in elements)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								def test_auto_partition_from_url_with_rfc9110_content_type():
 								    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    elements = partition(
 								        url=url, content_type="text/plain; charset=utf-8", strategy=PartitionStrategy.HI_RES
 								    )
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements[0] == Title("Apache License")
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert all(e.metadata.url == url for e in elements)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								def test_auto_partition_from_url_without_providing_content_type():
 								    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    elements = partition(url=url, strategy=PartitionStrategy.HI_RES)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert elements[0] == Title("Apache License")
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert all(e.metadata.url == url for e in elements)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								def test_auto_partition_warns_if_header_set_and_not_url(caplog: LogCaptureFixture):
 								    partition(
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        example_doc_path("eml/fake-email.eml"),
 								        headers={"Accept": "application/pdf"},
 								        strategy=PartitionStrategy.HI_RES,
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    )
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert caplog.records[0].levelname == "WARNING"
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert "headers kwarg is set but the url kwarg is not. The headers kwarg will b" in caplog.text
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_from_url_routes_timeout_to_HTTP_request(request: FixtureRequest):
 								    file_and_type_from_url_ = function_mock(
 								        request,
 								        "unstructured.partition.auto.file_and_type_from_url",
 								        side_effect=ConnectionError("Trouble on the wire ..."),
 								    )
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    with pytest.raises(ConnectionError, match="Trouble on the wire ..."):
 								        partition(url="http://eie.io", request_timeout=326)
 								    file_and_type_from_url_.assert_called_once_with(
 								        url="http://eie.io", content_type=None, headers={}, ssl_verify=True, request_timeout=326
 								    )
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								# ================================================================================================
 								# OTHER ARGS
 								# ================================================================================================
 								# -- chunking_strategy ----------------------------------------------------
-												chunk_by_title decorator (#1304)

### Summary

Partial solution to #1185.
Related to #1222.
Creates decorator from `chunk_by_title` cleaning brick.
Breaks a document into sections based on the presence of Title elements.
Also starts a new section under the following conditions:

- If metadata changes, indicating a change in section or page or a
switch to processing attachments. If `multipage_sections=True`, sections
can span pages. `multipage_sections` defaults to True.
- If the length of the section exceeds `new_after_n_chars` characters.
The default is 1500. The **chunking function does not split individual
elements**, so it's possible for a section to exceed that threshold if
an individual element is over `new_after_n_chars` characters, which
could occur with a long NarrativeText element.

Combines sections under these conditions
- Sections under `combine_under_n_chars` characters are combined. The
default is 500.

### Testing

from unstructured.partition.html import partition_html

url = "https://understandingwar.org/backgrounder/russian-offensive-campaign-assessment-august-27-2023-0"
chunks = partition_html(url=url, chunking_strategy="by_title")

for chunk in chunks:
    print(chunk)
    print("\n\n" + "-"*80)
    input()

											
										
										
											2023-09-11 16:00:14 -05:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_forwards_chunking_strategy_via_kwargs():
 								    chunks = partition(example_doc_path("example-10k-1p.html"), chunking_strategy="by_title")
 								    assert all(isinstance(chunk, (CompositeElement, Table, TableChunk)) for chunk in chunks)
-												chore: Table chunking (#1540)

This change is adding to our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy we have renamed the `combine_under_n_chars` to
`max_characters`. It functions the same way for the combining elements
under Title's, as well as specifying a chunk size (in chars) for
TableChunk elements.

*renaming the variable to `max_characters` will also reflect the 'hard
max' we will implement for large elements in followup PRs


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
In [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
											
										
										
											2023-10-03 09:40:34 -07:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_forwards_max_characters_via_kwargs():
 								    chunks = partition(
 								        example_doc_path("example-10k-1p.html"),
 								        chunking_strategy="by_title",
 								        max_characters=250,
 								    )
 								    assert all(len(chunk.text) <= 250 for chunk in chunks)
-												detect document language across all partitioners (#1627)

### Summary
Closes #1534 and #1535
Detects document language using `langdetect` package. 
Creates new kwargs for user to set the document language (`languages`)
or detect the language at the element level instead of the default
document level (`detect_language_per_element`)

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
											
										
										
											2023-10-10 20:47:56 -05:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# -- detect_language_per_element ------------------------------------------
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_respects_detect_language_per_element_arg():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(
 								        example_doc_path("language-docs/eng_spa_mult.txt"), detect_language_per_element=True
 								    )
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    langs = [element.metadata.languages for element in elements]
 								    assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
 								# -- languages ------------------------------------------------------------
-												detect document language across all partitioners (#1627)

### Summary
Closes #1534 and #1535
Detects document language using `langdetect` package. 
Creates new kwargs for user to set the document language (`languages`)
or detect the language at the element level instead of the default
document level (`detect_language_per_element`)

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
											
										
										
											2023-10-10 20:47:56 -05:00
 								@pytest.mark.parametrize(
-												Feat/bump numpy to 2 (#3961)

This PR updates a few dependencies so that they are compatible with
`numpy>=2`.
											
										
										
											2025-03-18 16:33:48 -05:00
+								    "file_extension",
 								    [
 								        "doc",
 								        "docx",
 								        "eml",
 								        "epub",
 								        "html",
 								        "md",
 								        "odt",
 								        "org",
 								        "ppt",
 								        "pptx",
 								        "rst",
 								        "rtf",
 								        "txt",
 								        "xml",
 								    ],
-												detect document language across all partitioners (#1627)

### Summary
Closes #1534 and #1535
Detects document language using `langdetect` package. 
Creates new kwargs for user to set the document language (`languages`)
or detect the language at the element level instead of the default
document level (`detect_language_per_element`)

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
											
										
										
											2023-10-10 20:47:56 -05:00
+								)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_respects_language_arg(file_extension: str):
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    elements = partition(
 								        example_doc_path(f"language-docs/eng_spa_mult.{file_extension}"), languages=["deu"]
 								    )
-												detect document language across all partitioners (#1627)

### Summary
Closes #1534 and #1535
Detects document language using `langdetect` package. 
Creates new kwargs for user to set the document language (`languages`)
or detect the language at the element level instead of the default
document level (`detect_language_per_element`)

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
											
										
										
											2023-10-10 20:47:56 -05:00
+								    assert all(element.metadata.languages == ["deu"] for element in elements)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# -- include_page_breaks --------------------------------------------------
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_forwards_include_page_breaks_to_partition_pdf():
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    elements = partition(
-												refactor: restructure PDF/Image example document organization (#3410)

This PR aims to improve the organization and readability of our example
documents used in unit tests, specifically focusing on PDF and image
files.

### Summary
- Created two new subdirectories in the `example-docs` folder:
  - `pdf/`: for all PDF example files
  - `img/`: for all image example files
- Moved relevant PDF files from `example-docs/` to `example-docs/pdf/`
- Moved relevant image files from `example-docs/` to `example-docs/img/`
- Updated file paths in affected unit & ingest tests to reflect the new
directory structure

### Testing
All unit & ingest tests should be updated and verified to work with the
new file structure.

## Notes
Other file types (e.g., office documents, HTML files) remain in the root
of `example-docs/` for now.

## Next Steps
Consider similar reorganization for other file types if this structure
proves to be beneficial.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-07-18 15:21:32 -07:00
+								        example_doc_path("pdf/layout-parser-paper-fast.pdf"),
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        include_page_breaks=True,
 								        strategy=PartitionStrategy.HI_RES,
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    )
 								    assert "PageBreak" in [elem.category for elem in elements]
 								# -- metadata_filename ----------------------------------------------------
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_forwards_metadata_filename_via_kwargs():
 								    with open(example_doc_path("fake-text.txt"), "rb") as f:
 								        elements = partition(file=f, metadata_filename="much-more-interesting-name.txt")
 								    assert all(e.metadata.filename == "much-more-interesting-name.txt" for e in elements)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								# -- ocr_languages --------------------------------------------------------
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_image_formats_languages_for_tesseract(request: FixtureRequest):
 								    process_file_with_ocr_ = function_mock(
 								        request, "unstructured.partition.pdf_image.ocr.process_file_with_ocr"
 								    )
 								    partition(
-												refactor: restructure PDF/Image example document organization (#3410)

This PR aims to improve the organization and readability of our example
documents used in unit tests, specifically focusing on PDF and image
files.

### Summary
- Created two new subdirectories in the `example-docs` folder:
  - `pdf/`: for all PDF example files
  - `img/`: for all image example files
- Moved relevant PDF files from `example-docs/` to `example-docs/pdf/`
- Moved relevant image files from `example-docs/` to `example-docs/img/`
- Updated file paths in affected unit & ingest tests to reflect the new
directory structure

### Testing
All unit & ingest tests should be updated and verified to work with the
new file structure.

## Notes
Other file types (e.g., office documents, HTML files) remain in the root
of `example-docs/` for now.

## Next Steps
Consider similar reorganization for other file types if this structure
proves to be beneficial.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-07-18 15:21:32 -07:00
+								        example_doc_path("img/chi_sim_image.jpeg"),
 								        strategy=PartitionStrategy.HI_RES,
 								        languages=["zh"],
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    )
 								    call_kwargs = process_file_with_ocr_.call_args_list[0][1]
 								    assert call_kwargs["ocr_languages"] == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								@pytest.mark.parametrize(("languages", "ocr_languages"), [(["auto"], ""), (["eng"], "")])
 								def test_auto_partition_ignores_empty_string_for_ocr_languages(
 								    languages: list[str], ocr_languages: str
 								):
 								    elements = partition(
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        example_doc_path("book-war-and-peace-1p.txt"),
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        strategy=PartitionStrategy.OCR_ONLY,
 								        ocr_languages=ocr_languages,
 								        languages=languages,
 								    )
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert all(e.metadata.languages == ["eng"] for e in elements)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								def test_auto_partition_warns_with_ocr_languages(caplog: LogCaptureFixture):
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    partition(
-												refactor: restructure PDF/Image example document organization (#3410)

This PR aims to improve the organization and readability of our example
documents used in unit tests, specifically focusing on PDF and image
files.

### Summary
- Created two new subdirectories in the `example-docs` folder:
  - `pdf/`: for all PDF example files
  - `img/`: for all image example files
- Moved relevant PDF files from `example-docs/` to `example-docs/pdf/`
- Moved relevant image files from `example-docs/` to `example-docs/img/`
- Updated file paths in affected unit & ingest tests to reflect the new
directory structure

### Testing
All unit & ingest tests should be updated and verified to work with the
new file structure.

## Notes
Other file types (e.g., office documents, HTML files) remain in the root
of `example-docs/` for now.

## Next Steps
Consider similar reorganization for other file types if this structure
proves to be beneficial.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-07-18 15:21:32 -07:00
+								        example_doc_path("pdf/chevron-page.pdf"),
 								        strategy=PartitionStrategy.HI_RES,
 								        ocr_languages="eng",
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    )
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
 								    assert caplog.records[0].levelname == "WARNING"
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert "The ocr_languages kwarg will be deprecated" in caplog.text
 								# -- skip_infer_table_types -----------------------------------------------
 								@pytest.mark.parametrize(
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    ("skip_infer_table_types", "filename", "has_text_as_html"),
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    [
 								        (["xlsx"], "stanley-cups.xlsx", False),
 								        ([], "stanley-cups.xlsx", True),
 								        (["odt"], "fake.odt", False),
 								        ([], "fake.odt", True),
 								    ],
 								)
 								def test_auto_partition_respects_skip_infer_table_types(
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    skip_infer_table_types: list[str], filename: str, has_text_as_html: bool
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								):
 								    with open(example_doc_path(filename), "rb") as f:
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								        elements = partition(file=f, skip_infer_table_types=skip_infer_table_types)
 								    table_elements = [e for e in elements if isinstance(e, Table)]
 								    assert table_elements
 								    for e in table_elements:
 								        assert (e.metadata.text_as_html is not None) == has_text_as_html
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								# ================================================================================================
 								# METADATA BEHAVIORS
 								# ================================================================================================
 								# -- .filetype ------------------------------------------------------------
 								@pytest.mark.parametrize(
-												rfctr(auto): add _PartitionerLoader (#3418)

**Summary**
Replace conditional explicit import of partitioner modules in
`.partition.auto` with the new `_PartitionerLoader` class. This avoids
unbound variable warnings and is much less noisy.

`_PartitionerLoader` makes use of the new `FileType` property
`.importable_package_dependencies` to determine whether all required
packages are importable before dispatching the file to its partitioner.
It uses `FileType.extra_name` to form a helpful error message when a
dependency is not installed, so the caller knows which `pip install`
extra to specify to remedy the error.

`_PartitionerLoader` uses the `FileType` properties
`.partitioner_module_qname` and `.partitioner_function_name` to load
the partitioner once its dependencies are verified. Loaded partitioners
are cached with module lifetime scope for efficiency.
											
										
										
											2024-07-21 23:03:55 -07:00
+								    ("content_type", "shortname", "expected_value"),
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    [
 								        ("text/csv", "csv", "text/csv"),
 								        ("text/html", "html", "text/html"),
 								        ("jdsfjdfsjkds", "pdf", None),
 								    ],
 								)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_adds_filetype_to_metadata(
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    request: FixtureRequest,
 								    content_type: str,
-												rfctr(auto): add _PartitionerLoader (#3418)

**Summary**
Replace conditional explicit import of partitioner modules in
`.partition.auto` with the new `_PartitionerLoader` class. This avoids
unbound variable warnings and is much less noisy.

`_PartitionerLoader` makes use of the new `FileType` property
`.importable_package_dependencies` to determine whether all required
packages are importable before dispatching the file to its partitioner.
It uses `FileType.extra_name` to form a helpful error message when a
dependency is not installed, so the caller knows which `pip install`
extra to specify to remedy the error.

`_PartitionerLoader` uses the `FileType` properties
`.partitioner_module_qname` and `.partitioner_function_name` to load
the partitioner once its dependencies are verified. Loaded partitioners
are cached with module lifetime scope for efficiency.
											
										
										
											2024-07-21 23:03:55 -07:00
+								    shortname: str,
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    expected_value: str | None,
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								):
 								    partition_fn_ = function_mock(
 								        request,
-												rfctr(auto): add _PartitionerLoader (#3418)

**Summary**
Replace conditional explicit import of partitioner modules in
`.partition.auto` with the new `_PartitionerLoader` class. This avoids
unbound variable warnings and is much less noisy.

`_PartitionerLoader` makes use of the new `FileType` property
`.importable_package_dependencies` to determine whether all required
packages are importable before dispatching the file to its partitioner.
It uses `FileType.extra_name` to form a helpful error message when a
dependency is not installed, so the caller knows which `pip install`
extra to specify to remedy the error.

`_PartitionerLoader` uses the `FileType` properties
`.partitioner_module_qname` and `.partitioner_function_name` to load
the partitioner once its dependencies are verified. Loaded partitioners
are cached with module lifetime scope for efficiency.
											
										
										
											2024-07-21 23:03:55 -07:00
+								        f"unstructured.partition.{shortname}.partition_{shortname}",
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								        return_value=[Text("text 1"), Text("text 2")],
 								    )
-												rfctr(auto): add _PartitionerLoader (#3418)

**Summary**
Replace conditional explicit import of partitioner modules in
`.partition.auto` with the new `_PartitionerLoader` class. This avoids
unbound variable warnings and is much less noisy.

`_PartitionerLoader` makes use of the new `FileType` property
`.importable_package_dependencies` to determine whether all required
packages are importable before dispatching the file to its partitioner.
It uses `FileType.extra_name` to form a helpful error message when a
dependency is not installed, so the caller knows which `pip install`
extra to specify to remedy the error.

`_PartitionerLoader` uses the `FileType` properties
`.partitioner_module_qname` and `.partitioner_function_name` to load
the partitioner once its dependencies are verified. Loaded partitioners
are cached with module lifetime scope for efficiency.
											
										
										
											2024-07-21 23:03:55 -07:00
+								    partitioner_loader_get_ = method_mock(
 								        request, _PartitionerLoader, "get", return_value=partition_fn_
 								    )
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    elements = partition(
-												refactor: restructure PDF/Image example document organization (#3410)

This PR aims to improve the organization and readability of our example
documents used in unit tests, specifically focusing on PDF and image
files.

### Summary
- Created two new subdirectories in the `example-docs` folder:
  - `pdf/`: for all PDF example files
  - `img/`: for all image example files
- Moved relevant PDF files from `example-docs/` to `example-docs/pdf/`
- Moved relevant image files from `example-docs/` to `example-docs/img/`
- Updated file paths in affected unit & ingest tests to reflect the new
directory structure

### Testing
All unit & ingest tests should be updated and verified to work with the
new file structure.

## Notes
Other file types (e.g., office documents, HTML files) remain in the root
of `example-docs/` for now.

## Next Steps
Consider similar reorganization for other file types if this structure
proves to be beneficial.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-07-18 15:21:32 -07:00
+								        example_doc_path("pdf/layout-parser-paper-fast.pdf"), content_type=content_type
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    )
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
-												rfctr(auto): add _PartitionerLoader (#3418)

**Summary**
Replace conditional explicit import of partitioner modules in
`.partition.auto` with the new `_PartitionerLoader` class. This avoids
unbound variable warnings and is much less noisy.

`_PartitionerLoader` makes use of the new `FileType` property
`.importable_package_dependencies` to determine whether all required
packages are importable before dispatching the file to its partitioner.
It uses `FileType.extra_name` to form a helpful error message when a
dependency is not installed, so the caller knows which `pip install`
extra to specify to remedy the error.

`PartitionerLoader` uses the `FileType` properties
`.partitioner_module_qname` and `partitioner_function_name` to load
the partitioner once its dependencies are verified. Loaded partitioners
are cached with module lifetime scope for efficiency.
											
										
										
											2024-07-21 23:03:55 -07:00
+								    partitioner_loader_get_.assert_called_once()
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert len(elements) == 2
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert all(e.metadata.filetype == expected_value for e in elements)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
 								@pytest.mark.parametrize(
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    "content_type",
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    [
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								        # -- content-type provided as argument --
 								        "application/pdf",
 								        # -- auto-detected content-type --
 								        None,
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    ],
 								)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partitioner(
-												rfctr(auto): add _PartitionerLoader (#3418)

**Summary**
Replace conditional explicit import of partitioner modules in
`.partition.auto` with the new `_PartitionerLoader` class. This avoids
unbound variable warnings and is much less noisy.

`_PartitionerLoader` makes use of the new `FileType` property
`.importable_package_dependencies` to determine whether all required
packages are importable before dispatching the file to its partitioner.
It uses `FileType.extra_name` to form a helpful error message when a
dependency is not installed, so the caller knows which `pip install`
extra to specify to remedy the error.

`_PartitionerLoader` uses the `FileType` properties
`.partitioner_module_qname` and `.partitioner_function_name` to load
the partitioner once its dependencies are verified. Loaded partitioners
are cached with module lifetime scope for efficiency.
											
										
										
											2024-07-21 23:03:55 -07:00
+								    request: FixtureRequest, content_type: str | None
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								):
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    metadata = ElementMetadata(filetype="imapdf")
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    partition_pdf_ = function_mock(
 								        request,
-												rfctr(auto): add _PartitionerLoader (#3418)

**Summary**
Replace conditional explicit import of partitioner modules in
`.partition.auto` with the new `_PartitionerLoader` class. This avoids
unbound variable warnings and is much less noisy.

`_PartitionerLoader` makes use of the new `FileType` property
`.importable_package_dependencies` to determine whether all required
packages are importable before dispatching the file to its partitioner.
It uses `FileType.extra_name` to form a helpful error message when a
dependency is not installed, so the caller knows which `pip install`
extra to specify to remedy the error.

`_PartitionerLoader` uses the `FileType` properties
`.partitioner_module_qname` and `.partitioner_function_name` to load
the partitioner once its dependencies are verified. Loaded partitioners
are cached with module lifetime scope for efficiency.
											
										
										
											2024-07-21 23:03:55 -07:00
+								        "unstructured.partition.pdf.partition_pdf",
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								        return_value=[Text("text 1", metadata=metadata), Text("text 2", metadata=metadata)],
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    )
-												rfctr(auto): add _PartitionerLoader (#3418)

**Summary**
Replace conditional explicit import of partitioner modules in
`.partition.auto` with the new `_PartitionerLoader` class. This avoids
unbound variable warnings and is much less noisy.

`_PartitionerLoader` makes use of the new `FileType` property
`.importable_package_dependencies` to determine whether all required
packages are importable before dispatching the file to its partitioner.
It uses `FileType.extra_name` to form a helpful error message when a
dependency is not installed, so the caller knows which `pip install`
extra to specify to remedy the error.

`_PartitionerLoader` uses the `FileType` properties
`.partitioner_module_qname` and `.partitioner_function_name` to load
the partitioner once its dependencies are verified. Loaded partitioners
are cached with module lifetime scope for efficiency.
											
										
										
											2024-07-21 23:03:55 -07:00
+								    partitioner_loader_get_ = method_mock(
 								        request, _PartitionerLoader, "get", return_value=partition_pdf_
 								    )
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
-												refactor: restructure PDF/Image example document organization (#3410)

This PR aims to improve the organization and readability of our example
documents used in unit tests, specifically focusing on PDF and image
files.

### Summary
- Created two new subdirectories in the `example-docs` folder:
  - `pdf/`: for all PDF example files
  - `img/`: for all image example files
- Moved relevant PDF files from `example-docs/` to `example-docs/pdf/`
- Moved relevant image files from `example-docs/` to `example-docs/img/`
- Updated file paths in affected unit & ingest tests to reflect the new
directory structure

### Testing
All unit & ingest tests should be updated and verified to work with the
new file structure.

## Notes
Other file types (e.g., office documents, HTML files) remain in the root
of `example-docs/` for now.

## Next Steps
Consider similar reorganization for other file types if this structure
proves to be beneficial.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-07-18 15:21:32 -07:00
+								    elements = partition(
 								        example_doc_path("pdf/layout-parser-paper-fast.pdf"), content_type=content_type
 								    )
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
-												rfctr(auto): add _PartitionerLoader (#3418)

**Summary**
Replace conditional explicit import of partitioner modules in
`.partition.auto` with the new `_PartitionerLoader` class. This avoids
unbound variable warnings and is much less noisy.

`_PartitionerLoader` makes use of the new `FileType` property
`.importable_package_dependencies` to determine whether all required
packages are importable before dispatching the file to its partitioner.
It uses `FileType.extra_name` to form a helpful error message when a
dependency is not installed, so the caller knows which `pip install`
extra to specify to remedy the error.

`_PartitionerLoader` uses the `FileType` properties
`.partitioner_module_qname` and `.partitioner_function_name` to load
the partitioner once its dependencies are verified. Loaded partitioners
are cached with module lifetime scope for efficiency.
											
										
										
											2024-07-21 23:03:55 -07:00
+								    partitioner_loader_get_.assert_called_once_with(ANY, FileType.PDF)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert len(elements) == 2
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert all(e.metadata.filetype == "application/pdf" for e in elements)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								@pytest.mark.parametrize(
-												fix(auto): quick fix for auto test failing in CI (#3715)

Better fix to follow.
											
										
										
											2024-10-10 11:44:00 -07:00
+								    ("file_name", "file_type"),
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    [
-												fix(auto): quick fix for auto test failing in CI (#3715)

Better fix to follow.
											
										
										
											2024-10-10 11:44:00 -07:00
+								        ("stanley-cups.csv", FileType.CSV),
 								        ("simple.doc", FileType.DOC),
 								        ("simple.docx", FileType.DOCX),
 								        ("fake-email.eml", FileType.EML),
 								        ("simple.epub", FileType.EPUB),
 								        ("fake-html.html", FileType.HTML),
 								        ("README.md", FileType.MD),
 								        ("fake-email.msg", FileType.MSG),
 								        ("simple.odt", FileType.ODT),
 								        ("pdf/DA-1p.pdf", FileType.PDF),
 								        ("fake-power-point.ppt", FileType.PPT),
 								        ("simple.pptx", FileType.PPTX),
 								        ("README.rst", FileType.RST),
 								        ("fake-doc.rtf", FileType.RTF),
 								        ("stanley-cups.tsv", FileType.TSV),
 								        ("fake-text.txt", FileType.TXT),
 								        ("tests-example.xls", FileType.XLSX),
 								        ("stanley-cups.xlsx", FileType.XLSX),
 								        ("factbook.xml", FileType.XML),
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    ],
 								)
-												fix(auto): quick fix for auto test failing in CI (#3715)

Better fix to follow.
											
										
										
											2024-10-10 11:44:00 -07:00
+								def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(
 								    file_name: str, file_type: FileType
 								):
 								    file_path = example_doc_path(file_name)
-												fix(file): fix OLE-based file-type auto-detection (#3437)

**Summary**
A DOC, PPT, or XLS file sent to partition() as a file-like object is
misidentified as a MSG file and raises an exception in python-oxmsg
(which is used to process MSG files).

**Fix**
DOC, PPT, XLS, and MSG are all Microsoft OLE-based files, aka. Compound
File Binary Format (CFBF). These can be reliably distinguished by
inspecting magic bytes in certain locations. `libmagic` is unreliable at
this or doesn't try, reporting the generic `"application/x-ole-storage"`
which corresponds to the "container" CFBF format (vaguely like a
Microsoft Zip format) that all these document types are stored in.

Unconditionally use `filetype.guess_mime()` provided by the `filetype`
package that is part of the base unstructured install. Unlike
`libmagic`, this package reliably detects the distinguished MIME-type
(e.g. `"application/msword"`) for OLE file subtypes.

Fixes #3364
											
										
										
											2024-07-25 10:25:41 -07:00
+								    partition_fn_name = file_type.partitioner_function_name
 								    module = import_module(file_type.partitioner_module_qname)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    partition_fn = getattr(module, partition_fn_name)
-												fix(auto): quick fix for auto test failing in CI (#3715)

Better fix to follow.
											
										
										
											2024-10-10 11:44:00 -07:00
+								    # -- partition the example-doc for this filetype --
-												rfctr(email): eml partitioner rewrite (#3694)

**Summary**
Initial attempts to incrementally refactor `partition_email()` into
shape to allow pluggable partitioning quickly became too complex for
ready code-review. Prepare separate rewritten module and tests and swap
them out whole.

**Additional Context**
- Uses the modern stdlib `email` module to reliably accomplish several
manual decoding steps in the legacy code.
- Remove obsolete email-specific element-types which were replaced 18
months or so ago with email-specific metadata fields for things like Cc:
addresses, subject, etc.
- Remove accepting an email as `text: str` because MIME-email is
inherently a binary format which can and often does contain multiple and
contradictory character-encodings.
- Remove `encoding` parameters as it is now unused. An email file is not
a text file and as such does not have a single overall encoding.
Character encoding is specified individually for each MIME-part within
the message and often varies from one part to another in the same
message.
- Remove the need for a caller to specify `attachment_partitioner`.
There is only one reasonable choice for this which is
`auto.partition()`, consistent with the same interface and operation in
`partition_msg()`.
- Fixes #3671 along the way by silently skipping attachments with a
file-type for which there is no partitioner.
- Substantially extend the test-suite to cover multiple
transport-encoding/charset combinations.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: scanny <scanny@users.noreply.github.com>
											
										
										
											2024-10-15 19:02:33 -07:00
+								    elements = partition_fn(file_path, process_attachments=False)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert elements
 								    assert all(
-												fix(file): fix OLE-based file-type auto-detection (#3437)

**Summary**
A DOC, PPT, or XLS file sent to partition() as a file-like object is
misidentified as a MSG file and raises an exception in python-oxmsg
(which is used to process MSG files).

**Fix**
DOC, PPT, XLS, and MSG are all Microsoft OLE-based files, aka. Compound
File Binary Format (CFBF). These can be reliably distinguished by
inspecting magic bytes in certain locations. `libmagic` is unreliable at
this or doesn't try, reporting the generic `"application/x-ole-storage"`
which corresponds to the "container" CFBF format (vaguely like a
Microsoft Zip format) that all these document types are stored in.

Unconditionally use `filetype.guess_mime()` provided by the `filetype`
package that is part of the base unstructured install. Unlike
`libmagic`, this package reliably detects the distinguished MIME-type
(e.g. `"application/msword"`) for OLE file subtypes.

Fixes #3364
											
										
										
											2024-07-25 10:25:41 -07:00
+								        e.metadata.filetype == file_type.mime_type
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								        for e in elements
 								        if e.metadata.filetype is not None
 								    )
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
-												Fix file detection when spooled file is passed (#3932)

This pull request fixes the scenario where a `SpooledTemporaryFile` is
passed to `detect_filetype()`. In such cases some weird number was assigned
as `name` (and it couldn't be overwritten, as `SpooledTemporaryFile` can't
have fields assigned 😩), so I added another scenario to our object factory
where we parse this type of file.
For `BytesIO`, the `name` attr is None as it should be, and some other
metadata fields are leveraged for file type recognition.
											
										
										
											2025-02-20 14:00:25 +01:00
+								def test_detect_filetype_maps_file_to_bytes_io_when_spooled_temp_file_used(mocker):
 								    """Detection context built from a `SpooledTemporaryFile` exposes the file's content."""
 								    # -- replace the detector class with a mock so the argument it receives can be inspected --
 								    detect_filetype_mock = MagicMock(return_value=FileType.JSON)
 								    mocker.patch("unstructured.file_utils.filetype._FileTypeDetector", detect_filetype_mock)
 								    with tempfile.SpooledTemporaryFile() as f:
 								        # -- payload is not valid JSON; the test only checks it is read back verbatim --
 								        f.write(b'{"text": Hello, world!}')
 								        # -- rewind so the content just written is readable from the start --
 								        f.seek(0)
 								        detect_filetype(file=f)
 								    # -- first positional argument of the mocked `.file_type()` call --
 								    file_detection_context = detect_filetype_mock.file_type.call_args[0][0]
 								    assert file_detection_context.text_head == '{"text": Hello, world!}'
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# -- .languages -----------------------------------------------------------
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_passes_user_provided_languages_arg_to_PDF():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(
-												refactor: restructure PDF/Image example document organization (#3410)

This PR aims to improve the organization and readability of our example
documents used in unit tests, specifically focusing on PDF and image
files.

### Summary
- Created two new subdirectories in the `example-docs` folder:
  - `pdf/`: for all PDF example files
  - `img/`: for all image example files
- Moved relevant PDF files from `example-docs/` to `example-docs/pdf/`
- Moved relevant image files from `example-docs/` to `example-docs/img/`
- Updated file paths in affected unit & ingest tests to reflect the new
directory structure

### Testing
All unit & ingest tests should be updated and verified to work with the
new file structure.

## Notes
Other file types (e.g., office documents, HTML files) remain in the root
of `example-docs/` for now.

## Next Steps
Consider similar reorganization for other file types if this structure
proves to be beneficial.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
											
										
										
											2024-07-18 15:21:32 -07:00
+								        example_doc_path("pdf/chevron-page.pdf"),
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								        strategy=PartitionStrategy.OCR_ONLY,
 								        languages=["eng"],
 								    )
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    assert all(e.metadata.languages == ["eng"] for e in elements)
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_languages_argument_default_to_None_when_omitted():
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    elements = partition(example_doc_path("handbook-1p.docx"), detect_language_per_element=True)
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								    # -- PageBreak and any other element with no text is assigned `None` --
 								    assert all(e.text == "" for e in elements if e.metadata.languages is None)
-												detect document language across all partitioners (#1627)

### Summary
Closes #1534 and #1535
Detects document language using `langdetect` package. 
Creates new kwargs for user to set the document language (`languages`)
or detect the language at the element level instead of the default
document level (`detect_language_per_element`)

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
											
										
										
											2023-10-10 20:47:56 -05:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_default_does_not_overwrite_other_defaults():
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    """`partition()` ["eng"] default does not overwrite ["auto"] default in other partitioners."""
-												detect document language across all partitioners (#1627)

### Summary
Closes #1534 and #1535
Detects document language using `langdetect` package. 
Creates new kwargs for user to set the document language (`languages`)
or detect the language at the element level instead of the default
document level (`detect_language_per_element`)

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
											
										
										
											2023-10-10 20:47:56 -05:00
+								    # the default for `languages` is ["auto"] in partition_text
 								    from unstructured.partition.text import partition_text
 								    # Use a document that is primarily in a language other than English
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    file_path = example_doc_path("language-docs/UDHR_first_article_all.txt")
 								    text_elements = partition_text(file_path)
-												detect document language across all partitioners (#1627)

### Summary
Closes #1534 and #1535
Detects document language using `langdetect` package. 
Creates new kwargs for user to set the document language (`languages`)
or detect the language at the element level instead of the default
document level (`detect_language_per_element`)

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
											
										
										
											2023-10-10 20:47:56 -05:00
+								    assert text_elements[0].metadata.languages != ["eng"]
-												rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary**
Improve expression in auto-partition tests and fix xfails and skips. Add
issues for the two hard-fails where xfail needed to stay.
											
										
										
											2024-07-09 22:29:07 -07:00
+								    auto_elements = partition(file_path)
-												detect document language across all partitioners (#1627)

### Summary
Closes #1534 and #1535
Detects document language using `langdetect` package. 
Creates new kwargs for user to set the document language (`languages`)
or detect the language at the element level instead of the default
document level (`detect_language_per_element`)

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
											
										
										
											2023-10-10 20:47:56 -05:00
+								    assert auto_elements[0].metadata.languages != ["eng"]
 								    assert auto_elements[0].metadata.languages == text_elements[0].metadata.languages
-												fix: default to None for the languages metadata field (#1743)

### Summary
Closes #1714
Changes the default value for `languages` to `None` for elements that
don't have text or the language can't be detected.

### Testing
```
from unstructured.partition.auto import partition
filename = "example-docs/handbook-1p.docx"
elements = partition(filename=filename, detect_language_per_element=True)

# PageBreak elements don't have text and will be collected here
none_langs = [element for element in elements if element.metadata.languages is None]
none_langs[0].text
```

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-14 17:46:24 -05:00
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								# ================================================================================================
 								# MISCELLANEOUS BEHAVIORS
 								# ================================================================================================
-												feat: enable request timeout (#2013)

Courtesy @cdpierse.

Adds a test to PR #1529 in accordance with feedback.

Description from original PR:

In python the default behaviour of `requests.get` without a `timeout`
being set is to hang indefinitely. We have a production use case where
the desired behaviour would be to raise a timeout error rather than have
the application just hang.

This PR adds a new optional keyword parameter `request_timeout` to
`partition` which is passed to `file_and_type_from_url` in the case
where we are fetching from a URL. This is then passed to `requests.get`

---------

Co-authored-by: Charles Pierse <charlespierse@gmail.com>
											
										
										
											2023-11-07 18:44:58 -06:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_from_filename_works_on_empty_file():
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    assert partition(example_doc_path("empty.txt")) == []
-												enhancement: add support from bitmap images (#2414)

### Summary

Adds support for bitmap images (`.bmp`) in both file detection and
partitioning. Bitmap images will be processed with `partition_image`
just like JPGs and PNGs.

### Testing

```python
from unstructured.file_utils.filetype import detect_filetype
from unstructured.partition.auto import partition
from PIL import Image

filename = "example-docs/layout-parser-paper-with-table.jpg"
bmp_filename = "~/tmp/ayout-parser-paper-with-table.bmp"

img = Image.open(filename)
img.save(bmp_filename)

detect_filetype(filename=bmp_filename) # Should be FileType.BMP

elements = partition(filename=bmp_filename)
```
											
										
										
											2024-01-17 17:50:36 -05:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								def test_auto_partition_from_file_works_on_empty_file():
-												rfctr(auto): improve typing and organize auto tests (#3355)

**Summary**
In preparation for further work on auto-partitioning (`partition()`),
improve typing and organize `test_auto.py` by introducing categories.
											
										
										
											2024-07-08 14:25:17 -07:00
+								    with open(example_doc_path("empty.txt"), "rb") as f:
 								        assert partition(file=f) == []
-												enhancement: add support from bitmap images (#2414)

### Summary

Adds support for bitmap images (`.bmp`) in both file detection and
partitioning. Bitmap images will be processed with `partition_image`
just like JPGs and PNGs.

### Testing

```python
from unstructured.file_utils.filetype import detect_filetype
from unstructured.partition.auto import partition
from PIL import Image

filename = "example-docs/layout-parser-paper-with-table.jpg"
bmp_filename = "~/tmp/ayout-parser-paper-with-table.bmp"

img = Image.open(filename)
img.save(bmp_filename)

detect_filetype(filename=bmp_filename) # Should be FileType.BMP

elements = partition(filename=bmp_filename)
```
											
										
										
											2024-01-17 17:50:36 -05:00
-												enhancement: process `.p7s` files with `partition_email` (#2521)

### Summary

Closes #2489, which reported an inability to process `.p7s` files. PR
implements two changes:

- If the user selected content type for the email is not available and
there is another valid content type available, fall back to the other
valid content type.
- For signed message, extract the signature and add it to the metadata


### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/eml/signed-doc.p7s"
elements = partition(filename=filename) # should get a message about fall back logic
print(elements[0]) # "This is a test"
elements[0].metadata.to_dict() # Will see the signature
```
											
										
										
											2024-02-07 17:31:49 -05:00
-												rfctr(auto): add _PartitionerLoader (#3418)

**Summary**
Replace conditional explicit import of partitioner modules in
`.partition.auto` with the new `_PartitionerLoader` class. This avoids
unbound variable warnings and is much less noisy.

`_PartitionerLoader` makes use of the new `FileType` property
`.importable_package_dependencies` to determine whether all required
packages are importable before dispatching the file to its partitioner.
It uses `FileType.extra_name` to form a helpful error message when a
dependency is not installed, so the caller knows which `pip install`
extra to specify to remedy the error.

`_PartitionerLoader` uses the `FileType` properties
`.partitioner_module_qname` and `.partitioner_function_name` to load
the partitioner once its dependencies are verified. Loaded partitioners
are cached with module lifetime scope for efficiency.
											
										
										
											2024-07-21 23:03:55 -07:00
+								def test_auto_partition_that_requires_extras_raises_when_dependencies_are_not_installed(
 								    request: FixtureRequest,
 								):
 								    _PartitionerLoader._partitioners.pop(FileType.PDF, None)
 								    dependency_exists_ = function_mock(
 								        request, "unstructured.partition.auto.dependency_exists", return_value=False
 								    )
 								    match = r"partition_pdf\(\) is not available because one or more dependencies are not installed"
 								    with pytest.raises(ImportError, match=match):
-												rfctr(file): refactor detect_filetype() (#3429)

**Summary**
In preparation for fixing a cluster of bugs with automatic file-type
detection and paving the way for some reliability improvements, refactor
`unstructured.file_utils.filetype` module and improve thoroughness of
tests.

**Additional Context**
Factor type-recognition process into three distinct strategies that are
attempted in sequence. Attempted in order of preference,
type-recognition falls to the next strategy when the one before it is
not applicable or cannot determine the file-type. This provides a clear
basis for organizing the code and tests at the top level.

Consolidate the existing tests around these strategies, adding
additional cases to achieve better coverage.

Several bugs were uncovered in the process. Small ones were just fixed,
bigger ones will be remedied in following PRs.
											
										
										
											2024-07-23 16:18:48 -07:00
+								        partition(example_doc_path("pdf/layout-parser-paper-fast.pdf"))
-												rfctr(auto): add _PartitionerLoader (#3418)

**Summary**
Replace conditional explicit import of partitioner modules in
`.partition.auto` with the new `_PartitionerLoader` class. This avoids
unbound variable warnings and is much less noisy.

`_PartitionerLoader` makes use of the new `FileType` property
`.importable_package_dependencies` to determine whether all required
packages are importable before dispatching the file to its partitioner.
It uses `FileType.extra_name` to form a helpful error message when a
dependency is not installed, so the caller knows which `pip install`
extra to specify to remedy the error.

`_PartitionerLoader` uses the `FileType` properties
`.partitioner_module_qname` and `.partitioner_function_name` to load
the partitioner once its dependencies are verified. Loaded partitioners
are cached with module lifetime scope for efficiency.
											
										
										
											2024-07-21 23:03:55 -07:00
 								    dependency_exists_.assert_called_once_with("pdf2image")
-												enhancement: process `.p7s` files with `partition_email` (#2521)

### Summary

Closes #2489, which reported an inability to process `.p7s` files. PR
implements two changes:

- If the user selected content type for the email is not available and
there is another valid content type available, fall back to the other
valid content type.
- For signed message, extract the signature and add it to the metadata


### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/eml/signed-doc.p7s"
elements = partition(filename=filename) # should get a message about fall back logic
print(elements[0]) # "This is a test"
elements[0].metadata.to_dict() # Will see the signature
```
											
										
										
											2024-02-07 17:31:49 -05:00
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
 								# ================================================================================================
 								# MODULE-LEVEL FIXTURES
 								# ================================================================================================
 								@pytest.fixture()
 								def expected_docx_elements():
 								    return [
 								        Title("These are a few of my favorite things:"),
 								        ListItem("Parrots"),
 								        ListItem("Hockey"),
-												fix: improve false-positive Title elements on Chinese text (#3836)

**Summary**
Improve element-type mapping for Chinese text. Fixes bug where Chinese
text would produce large numbers of false-positive `Title` elements.

Fixes #3084

---------

Co-authored-by: scanny <scanny@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
											
										
										
											2024-12-17 17:16:42 -08:00
+								        Text("Analysis"),
-												rfctr(auto): improve expression in tests (#3384)

**Summary**
In preparation for further work on auto-partitioning, improve the
expression in the test-suite.
											
										
										
											2024-07-11 12:57:28 -07:00
+								        NarrativeText("This is my first thought. This is my second thought."),
 								        NarrativeText("This is my third thought."),
 								        Text("2023"),
 								        Address("DOYLESTOWN, PA 18901"),
 								    ]
-												Enable dynamic file type registration (#3946)

The purpose of this PR is to enable registering new file types
dynamically.

The PR enables this through 2 primary functions:

1. `unstructured.file_utils.model.create_file_type` This registers the
new `FileType` enum which enables the rest of unstructured to understand
a new type of file
2. `unstructured.file_utils.model.register_partitioner` Decorator that
enables registering a partitioner function to run for a file type.

---------

Co-authored-by: Roman Isecke <136338424+rbiseck3@users.noreply.github.com>
											
										
										
											2025-03-06 17:09:42 -05:00
 								def _test_partition_foo():
 								    """Do-nothing stand-in used as the partitioner for the custom "FOO" file-type."""
 								    return None
 								def test_auto_partition_works_with_custom_types(
 								    request: FixtureRequest,
 								):
 								    """The loader hands back the partitioner registered for a newly created file-type."""
 								    # Create a new "FOO" file-type and register the stub partitioner for it.
 								    foo_file_type = create_file_type(
 								        "FOO", canonical_mime_type="application/foo", extensions=[".foo"]
 								    )
 								    bind_partitioner = register_partitioner(foo_file_type)
 								    bind_partitioner(_test_partition_foo)
 								    # A fresh loader must resolve the new file-type to that exact function object.
 								    resolved_partitioner = _PartitionerLoader().get(foo_file_type)
 								    assert resolved_partitioner is _test_partition_foo