# unstructured/test_unstructured/partition/test_auto.py

import json
import os
import pathlib
import warnings
from importlib import import_module
from unittest.mock import patch

import docx
import pytest

from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import (
    Address,
    ElementMetadata,
    ListItem,
    NarrativeText,
    Table,
    Text,
    Title,
)
from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, FileType
from unstructured.partition import auto
from unstructured.partition.auto import _get_partition_with_extras, partition
from unstructured.partition.common import convert_office_doc
from unstructured.staging.base import elements_to_json

# Resolved directory that contains this test module.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
# Example documents directory, two levels above this module's directory.
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")

# Elements the email partitioning tests compare their results against.
EXPECTED_EMAIL_OUTPUT = [
    NarrativeText("This is a test email to use for unit tests."),
    Title("Important points:"),
    ListItem("Roses are red"),
    ListItem("Violets are blue"),
]

# Sample email path, relative to EXAMPLE_DOCS_DIRECTORY.
EML_TEST_FILE = "eml/fake-email.eml"

# True when the /.dockerenv marker file exists; used below in skipif markers.
is_in_docker = os.path.exists("/.dockerenv")


def test_auto_partition_email_from_filename():
    """Partition the sample email by path and check its content and file metadata."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
    parsed = partition(filename=eml_path, strategy="hi_res")

    assert len(parsed) > 0
    assert parsed == EXPECTED_EMAIL_OUTPUT

    # The path is split once so both metadata fields are checked against it.
    expected_directory, expected_name = os.path.split(eml_path)
    first_metadata = parsed[0].metadata
    assert first_metadata.filename == expected_name
    assert first_metadata.file_directory == expected_directory


def test_auto_partition_email_from_file():
    """Partition the sample email through a text-mode file handle."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
    with open(eml_path) as eml_file:
        parsed = partition(file=eml_file, strategy="hi_res")

    assert len(parsed) > 0
    assert parsed == EXPECTED_EMAIL_OUTPUT


def test_auto_partition_email_from_file_rb():
    """Partition the sample email through a binary-mode file handle."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
    with open(eml_path, "rb") as eml_file:
        parsed = partition(file=eml_file, strategy="hi_res")

    assert len(parsed) > 0
    assert parsed == EXPECTED_EMAIL_OUTPUT


@pytest.fixture()
def mock_docx_document():
    """Return a new docx document filled with a fixed set of test paragraphs."""
    doc = docx.Document()

    styled_paragraphs = (
        ("These are a few of my favorite things:", "Heading 1"),
        # NOTE(robinson) - this should get picked up as a list item due to the •
        ("• Parrots", "Normal"),
        ("Hockey", "List Bullet"),
        # NOTE(robinson) - this should get picked up as a title
        ("Analysis", "Normal"),
        # NOTE(robinson) - this should get dropped because it is empty
        ("", "Normal"),
        # NOTE(robinson) - this should get picked up as a narrative text
        ("This is my first thought. This is my second thought.", "Normal"),
        ("This is my third thought.", "Body Text"),
    )
    for paragraph_text, paragraph_style in styled_paragraphs:
        doc.add_paragraph(paragraph_text, style=paragraph_style)

    # NOTE(robinson) - this should just be regular text
    doc.add_paragraph("2023")

    return doc


@pytest.fixture()
def expected_docx_elements():
    """Return the elements the docx/doc tests expect for the mock document."""
    heading = Title("These are a few of my favorite things:")
    bullets = [ListItem("Parrots"), ListItem("Hockey")]
    body = [
        Title("Analysis"),
        NarrativeText("This is my first thought. This is my second thought."),
        NarrativeText("This is my third thought."),
        Text("2023"),
    ]
    return [heading, *bullets, *body]


def test_auto_partition_docx_with_filename(mock_docx_document, expected_docx_elements, tmpdir):
    """Save the mock document to disk and partition it by path."""
    docx_path = os.path.join(tmpdir.dirname, "mock_document.docx")
    mock_docx_document.save(docx_path)

    parsed = partition(filename=docx_path, strategy="hi_res")

    assert parsed == expected_docx_elements
    assert parsed[0].metadata.filename == os.path.split(docx_path)[1]


def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_elements, tmpdir):
    """Save the mock document to disk and partition it through a binary file handle."""
    docx_path = os.path.join(tmpdir.dirname, "mock_document.docx")
    mock_docx_document.save(docx_path)

    with open(docx_path, "rb") as docx_file:
        parsed = partition(file=docx_file, strategy="hi_res")

    assert parsed == expected_docx_elements


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
)
def test_auto_partition_doc_with_filename(
    mock_docx_document,
    expected_docx_elements,
    tmpdir,
    pass_metadata_filename,
    content_type,
):
    """Convert the mock docx to .doc and partition the .doc by path.

    Parametrized over whether metadata_filename is passed and over the content type.
    """
    work_dir = tmpdir.dirname
    docx_path = os.path.join(work_dir, "mock_document.docx")
    doc_path = os.path.join(work_dir, "mock_document.doc")

    mock_docx_document.save(docx_path)
    convert_office_doc(docx_path, work_dir, "doc")

    metadata_filename = None
    if pass_metadata_filename:
        metadata_filename = doc_path

    parsed = partition(
        filename=doc_path,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy="hi_res",
    )

    assert parsed == expected_docx_elements
    first_metadata = parsed[0].metadata
    assert first_metadata.filename == "mock_document.doc"
    assert first_metadata.file_directory == work_dir


# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
# determine that the file is an .doc document
@pytest.mark.xfail()
def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements, tmpdir):
    """Convert the mock docx to .doc and partition it through a binary file handle."""
    work_dir = tmpdir.dirname
    docx_path = os.path.join(work_dir, "mock_document.docx")
    doc_path = os.path.join(work_dir, "mock_document.doc")

    mock_docx_document.save(docx_path)
    convert_office_doc(docx_path, work_dir, "doc")

    with open(doc_path, "rb") as doc_file:
        parsed = partition(file=doc_file, strategy="hi_res")

    assert parsed == expected_docx_elements


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_filename(pass_metadata_filename, content_type):
    """Partition example-10k.html by path and check the reported file metadata."""
    html_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")

    metadata_filename = None
    if pass_metadata_filename:
        metadata_filename = html_path

    parsed = partition(
        filename=html_path,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy="hi_res",
    )

    assert len(parsed) > 0
    expected_directory, expected_name = os.path.split(html_path)
    first_metadata = parsed[0].metadata
    assert first_metadata.filename == expected_name
    assert first_metadata.file_directory == expected_directory


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_file(pass_metadata_filename, content_type):
    """Partition fake-html.html through a text-mode file handle."""
    html_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")

    metadata_filename = None
    if pass_metadata_filename:
        metadata_filename = html_path

    with open(html_path) as html_file:
        parsed = partition(
            file=html_file,
            metadata_filename=metadata_filename,
            content_type=content_type,
            strategy="hi_res",
        )

    assert len(parsed) > 0


def test_auto_partition_html_from_file_rb():
    """Partition fake-html.html through a binary-mode file handle."""
    html_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
    with open(html_path, "rb") as html_file:
        parsed = partition(file=html_file, strategy="hi_res")

    assert len(parsed) > 0


def test_auto_partition_json_from_filename():
    """Test auto-processing an unstructured json output file by filename."""
    json_path = os.path.join(
        EXAMPLE_DOCS_DIRECTORY,
        "..",
        "test_unstructured_ingest",
        "expected-structured-output",
        "azure",
        "spring-weather.html.json",
    )
    with open(json_path) as expected_file:
        expected_records = json.load(expected_file)

    partitioned = partition(filename=json_path, strategy="hi_res")
    actual_records = json.loads(elements_to_json(partitioned))

    # Metadata is removed from both sides before comparing.
    for records in (actual_records, expected_records):
        for record in records:
            record.pop("metadata")

    assert expected_records == actual_records


def test_auto_partition_json_raises_with_unprocessable_json(tmpdir):
    """Partitioning a JSON object that is not a list of dicts raises ValueError."""
    # NOTE(robinson) - This is unprocessable because it is not a list of dicts,
    # per the Unstructured ISD format
    payload = '{"hi": "there"}'

    json_path = os.path.join(tmpdir, "unprocessable.json")
    with open(json_path, "w") as json_file:
        json_file.write(payload)

    with pytest.raises(ValueError):
        partition(filename=json_path)


@pytest.mark.xfail(
    reason="parsed as text not json, https://github.com/Unstructured-IO/unstructured/issues/492",
)
def test_auto_partition_json_from_file():
    """Test auto-processing an unstructured json output file by file handle."""
    json_path = os.path.join(
        EXAMPLE_DOCS_DIRECTORY,
        "..",
        "test_unstructured_ingest",
        "expected-structured-output",
        "azure-blob-storage",
        "spring-weather.html.json",
    )
    with open(json_path) as expected_file:
        expected_records = json.load(expected_file)

    with open(json_path, encoding="utf-8") as partition_file:
        partitioned = partition(file=partition_file, strategy="hi_res")
        actual_records = json.loads(elements_to_json(partitioned))

    # coordinates are always in the element data structures, even if None
    for record in actual_records:
        for key in ("coordinates", "coordinate_system"):
            record.pop(key)

    assert expected_records == actual_records


# Elements the plain-text partitioning tests compare their results against.
EXPECTED_TEXT_OUTPUT = [
    NarrativeText("This is a test document to use for unit tests."),
    Address(text="Doylestown, PA 18901"),
    Title("Important points:"),
    ListItem("Hamburgers are delicious"),
    ListItem("Dogs are the best"),
    ListItem("I love fuzzy blankets"),
]


def test_auto_partition_text_from_filename():
    """Partition fake-text.txt by path and check its content and file metadata."""
    text_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    parsed = partition(filename=text_path, strategy="hi_res")

    assert len(parsed) > 0
    assert parsed == EXPECTED_TEXT_OUTPUT

    expected_directory, expected_name = os.path.split(text_path)
    first_metadata = parsed[0].metadata
    assert first_metadata.filename == expected_name
    assert first_metadata.file_directory == expected_directory


def test_auto_partition_text_from_file():
    """Partition fake-text.txt through a text-mode file handle."""
    text_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(text_path) as text_file:
        parsed = partition(file=text_file, strategy="hi_res")

    assert len(parsed) > 0
    assert parsed == EXPECTED_TEXT_OUTPUT


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_filename(pass_metadata_filename, content_type, request):
    """Partition the sample PDF by path and check the first elements and file metadata.

    The xfail marker is applied at runtime, just before the second-element checks.
    """
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")

    metadata_filename = None
    if pass_metadata_filename:
        metadata_filename = pdf_path

    parsed = partition(
        filename=pdf_path,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy="hi_res",
    )

    assert isinstance(parsed[0], Title)
    assert parsed[0].text.startswith("LayoutParser")

    expected_directory, expected_name = os.path.split(pdf_path)
    assert parsed[0].metadata.filename == expected_name
    assert parsed[0].metadata.file_directory == expected_directory

    # NOTE(alan): Xfail since new model skips the word Zejiang
    request.applymarker(pytest.mark.xfail)

    assert isinstance(parsed[1], NarrativeText)
    assert parsed[1].text.startswith("Zejiang Shen")


def test_auto_partition_pdf_uses_table_extraction():
    """With pdf_infer_table_structure=True the mocked model call gets extract_tables set."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    patch_target = "unstructured_inference.inference.layout.process_file_with_model"
    with patch(patch_target) as mock_process_file_with_model:
        partition(pdf_path, pdf_infer_table_structure=True, strategy="hi_res")
        # call_args unpacks into (positional args, keyword args) of the last call.
        _, call_kwargs = mock_process_file_with_model.call_args
        assert call_kwargs["extract_tables"]


def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
    """strategy="fast" is forwarded to the (mocked) PDF partitioner with the expected kwargs."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    stub_elements = [NarrativeText("Hello there!")]

    with patch.object(auto, "partition_pdf", return_value=stub_elements) as mock_partition:
        # Route the "pdf" entry of the extras map to the mock as well.
        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {"pdf": mock_partition})
        partition(filename=pdf_path, strategy="fast")

    expected_call_kwargs = {
        "filename": pdf_path,
        "metadata_filename": None,
        "file": None,
        "url": None,
        "include_page_breaks": False,
        "infer_table_structure": False,
        "strategy": "fast",
        "languages": ["eng"],
    }
    mock_partition.assert_called_once_with(**expected_call_kwargs)


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, request):
    """Partition the sample PDF through a binary file handle and check the first elements.

    The xfail marker is applied at runtime, just before the second-element checks.
    """
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")

    metadata_filename = None
    if pass_metadata_filename:
        metadata_filename = pdf_path

    with open(pdf_path, "rb") as pdf_file:
        parsed = partition(
            file=pdf_file,
            metadata_filename=metadata_filename,
            content_type=content_type,
            strategy="hi_res",
        )

    assert isinstance(parsed[0], Title)
    assert parsed[0].text.startswith("LayoutParser")

    # NOTE(alan): Xfail since new model misses the first word Zejiang
    request.applymarker(pytest.mark.xfail)

    assert isinstance(parsed[1], NarrativeText)
    assert parsed[1].text.startswith("Zejiang Shen")


def test_auto_partition_formats_languages_for_tesseract():
    """languages=["zh"] reaches the mocked model call as the expected ocr_languages string."""
    image_path = "example-docs/chi_sim_image.jpeg"
    patch_target = "unstructured_inference.inference.layout.process_file_with_model"
    with patch(patch_target) as mock_process_file_with_model:
        partition(image_path, strategy="hi_res", languages=["zh"])
        mock_process_file_with_model.assert_called_once_with(
            image_path,
            is_image=True,
            ocr_languages="chi_sim+chi_sim_vert+chi_tra+chi_tra_vert",
            ocr_mode="entire_page",
            extract_tables=False,
            model_name=None,
        )


def test_auto_partition_warns_with_ocr_languages(caplog):
    """Passing ocr_languages logs the deprecation message."""
    pdf_path = "example-docs/chevron-page.pdf"
    partition(filename=pdf_path, strategy="hi_res", ocr_languages="eng")

    expected_message = "The ocr_languages kwarg will be deprecated"
    assert expected_message in caplog.text


def test_partition_pdf_doesnt_raise_warning():
    """Partitioning the sample PDF emits no warning (warnings are escalated to errors)."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    # NOTE(robinson): This is the recommended way to check that no warning is emitted,
    # per the pytest docs.
    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
    #      #additional-use-cases-of-warnings-in-tests
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        partition(filename=pdf_path, strategy="hi_res")


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_image_default_strategy_hi_res(pass_metadata_filename, content_type):
    """Partition the sample JPG with strategy="auto" and check the first element."""
    image_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")

    metadata_filename = None
    if pass_metadata_filename:
        metadata_filename = image_path

    parsed = partition(
        filename=image_path,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy="auto",
    )

    # should be same result as test_partition_image_default_strategy_hi_res() in test_image.py
    expected_first_line = (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
    assert parsed[0].text == expected_first_line
    assert parsed[0].metadata.coordinates is not None


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpg(pass_metadata_filename, content_type):
    """Partitioning the sample JPG by path yields at least one element."""
    image_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")

    metadata_filename = None
    if pass_metadata_filename:
        metadata_filename = image_path

    parsed = partition(
        filename=image_path,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy="auto",
    )

    assert len(parsed) > 0


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpg_from_file(pass_metadata_filename, content_type):
    """Partitioning the sample JPG through a binary file handle yields at least one element."""
    image_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")

    metadata_filename = None
    if pass_metadata_filename:
        metadata_filename = image_path

    with open(image_path, "rb") as image_file:
        parsed = partition(
            file=image_file,
            metadata_filename=metadata_filename,
            content_type=content_type,
            strategy="auto",
        )

    assert len(parsed) > 0


def test_auto_partition_raises_with_bad_type(monkeypatch):
    """partition raises ValueError when file type detection returns None."""

    def detect_nothing(*args, **kwargs):
        return None

    monkeypatch.setattr(auto, "detect_filetype", detect_nothing)
    with pytest.raises(ValueError):
        partition(filename="made-up.fake", strategy="hi_res")


# Elements the pptx/ppt partitioning tests compare their results against.
EXPECTED_PPTX_OUTPUT = [
    Title("Adding a Bullet Slide"),
    ListItem("Find the bullet slide layout"),
    ListItem("Use _TextFrame.text for first bullet"),
    ListItem("Use _TextFrame.add_paragraph() for subsequent bullets"),
    NarrativeText("Here is a lot of text!"),
    NarrativeText("Here is some text in a text box!"),
]


def test_auto_partition_pptx_from_filename():
    """Partition fake-power-point.pptx by path and check its content and file metadata."""
    pptx_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
    parsed = partition(filename=pptx_path, strategy="hi_res")

    assert parsed == EXPECTED_PPTX_OUTPUT

    expected_directory, expected_name = os.path.split(pptx_path)
    first_metadata = parsed[0].metadata
    assert first_metadata.filename == expected_name
    assert first_metadata.file_directory == expected_directory


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_ppt_from_filename():
    """Partition fake-power-point.ppt by path and check its content and file metadata."""
    ppt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
    parsed = partition(filename=ppt_path, strategy="hi_res")

    assert parsed == EXPECTED_PPTX_OUTPUT

    expected_directory, expected_name = os.path.split(ppt_path)
    first_metadata = parsed[0].metadata
    assert first_metadata.filename == expected_name
    assert first_metadata.file_directory == expected_directory


def test_auto_with_page_breaks():
    """With include_page_breaks=True the result contains a "PageBreak" element."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    parsed = partition(filename=pdf_path, include_page_breaks=True, strategy="hi_res")

    categories = [element.category for element in parsed]
    assert "PageBreak" in categories


def test_auto_partition_epub_from_filename():
    """Partition winter-sports.epub by path and check how the first element starts."""
    epub_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
    parsed = partition(filename=epub_path, strategy="hi_res")

    assert len(parsed) > 0
    opening_text = parsed[0].text
    assert opening_text.startswith("The Project Gutenberg eBook of Winter Sports")


def test_auto_partition_epub_from_file():
    """Partition winter-sports.epub through a binary file handle."""
    epub_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
    with open(epub_path, "rb") as epub_file:
        parsed = partition(file=epub_file, strategy="hi_res")

    assert len(parsed) > 0
    opening_text = parsed[0].text
    assert opening_text.startswith("The Project Gutenberg eBook of Winter Sports")


# Elements the .msg partitioning test compares its result against.
EXPECTED_MSG_OUTPUT = [
    NarrativeText("This is a test email to use for unit tests."),
    Title("Important points:"),
    ListItem("Roses are red"),
    ListItem("Violets are blue"),
]


def test_auto_partition_msg_from_filename():
    """Partition fake-email.msg by path and compare against the expected elements."""
    msg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
    parsed = partition(filename=msg_path, strategy="hi_res")

    assert parsed == EXPECTED_MSG_OUTPUT


def test_auto_partition_rtf_from_filename():
    """Partition fake-doc.rtf by path and check the first element."""
    rtf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf")
    parsed = partition(filename=rtf_path, strategy="hi_res")

    first_element = parsed[0]
    assert first_element == Title("My First Heading")


def test_auto_partition_from_url():
    """Partition the project LICENSE by URL and check the first element and its url metadata."""
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
    parsed = partition(url=url, content_type="text/plain", strategy="hi_res")

    first_element = parsed[0]
    assert first_element == Title("Apache License")
    assert first_element.metadata.url == url


def test_partition_md_works_with_embedded_html():
    """Partition the project README by URL as markdown.

    Passes when at least one resulting element's text contains "unstructured".
    """
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"
    elements = partition(url=url, content_type="text/markdown", strategy="hi_res")
    # Check each element in turn. The previous loop re-tested elements[0] on every
    # iteration, so elements after the first were never actually examined.
    assert any("unstructured" in element.text for element in elements)


def test_auto_partition_warns_if_header_set_and_not_url(caplog):
    """Passing headers together with a filename logs a WARNING as the first record."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
    request_headers = {"Accept": "application/pdf"}
    partition(filename=eml_path, headers=request_headers, strategy="hi_res")

    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"


def test_auto_partition_works_with_unstructured_jsons():
    """Partition spring-weather.html.json by path and check the first element's text."""
    json_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
    parsed = partition(filename=json_path, strategy="hi_res")

    first_text = parsed[0].text
    assert first_text == "News Around NOAA"


def test_auto_partition_works_with_unstructured_jsons_from_file():
    """Partition spring-weather.html.json through a binary file handle."""
    json_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
    with open(json_path, "rb") as json_file:
        parsed = partition(file=json_file, strategy="hi_res")

    first_text = parsed[0].text
    assert first_text == "News Around NOAA"


def test_auto_partition_odt_from_filename():
    """Partition fake.odt by path and check the first element."""
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    parsed = partition(filename=odt_path, strategy="hi_res")

    first_element = parsed[0]
    assert first_element == Title("Lorem ipsum dolor sit amet.")


def test_auto_partition_odt_from_file():
    """Partition fake.odt through a binary file handle and check the first element."""
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    with open(odt_path, "rb") as odt_file:
        parsed = partition(file=odt_file, strategy="hi_res")

    first_element = parsed[0]
    assert first_element == Title("Lorem ipsum dolor sit amet.")


@pytest.mark.parametrize(
    ("content_type", "routing_func", "expected"),
    [
        ("text/csv", "csv", "text/csv"),
        ("text/html", "html", "text/html"),
        ("jdsfjdfsjkds", "pdf", None),
    ],
)
def test_auto_adds_filetype_to_metadata(content_type, routing_func, expected, monkeypatch):
    """partition sets metadata.filetype on elements returned by a stubbed partitioner."""

    def stub_partitioner(*args, **kwargs):
        return [Text("text 1"), Text("text 2")]

    patch_target = f"unstructured.partition.auto.partition_{routing_func}"
    with patch(patch_target, stub_partitioner) as mock_partition:
        # Route the extras map for this file type to the stub as well.
        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {routing_func: mock_partition})
        elements = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type)

    assert len(elements) == 2
    for element in elements:
        assert element.metadata.filetype == expected


@pytest.mark.parametrize(
    ("content_type", "expected"),
    [
        ("application/pdf", FILETYPE_TO_MIMETYPE[FileType.PDF]),
        (None, FILETYPE_TO_MIMETYPE[FileType.PDF]),
    ],
)
def test_auto_filetype_overrides_file_specific(content_type, expected, monkeypatch):
    """The filetype set by partition replaces one set by the stubbed PDF partitioner."""
    # Both stub elements share this metadata object, which carries a bogus filetype.
    pdf_metadata = ElementMetadata(filetype="imapdf")

    def stub_partition_pdf(*args, **kwargs):
        return [
            Text("text 1", metadata=pdf_metadata),
            Text("text 2", metadata=pdf_metadata),
        ]

    with patch("unstructured.partition.auto.partition_pdf", stub_partition_pdf) as mock_partition:
        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {"pdf": mock_partition})
        elements = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type)

    assert len(elements) == 2
    for element in elements:
        assert element.metadata.filetype == expected


# Every FileType member except UNK, ZIP and XLS; used to parametrize the test below.
supported_filetypes = [
    file_type
    for file_type in FileType
    if file_type not in (FileType.UNK, FileType.ZIP, FileType.XLS)
]


# File types whose partition module name differs from the lowercased enum member name.
FILETYPE_TO_MODULE = {
    FileType.JPG: "image",
    FileType.PNG: "image",
    FileType.TXT: "text",
    FileType.EML: "email",
}


@pytest.mark.parametrize("filetype", supported_filetypes)
def test_file_specific_produces_correct_filetype(filetype: FileType):
    """Check that a file-type-specific partitioner reports the matching MIME type.

    Imports ``unstructured.partition.<module>`` for the file type, calls its
    ``partition_<module>`` function on the first file in ``example-docs`` with the
    matching extension, and compares every non-None ``metadata.filetype`` against
    FILETYPE_TO_MIMETYPE. JPG, PNG, TIFF and EMPTY are skipped.
    """
    if filetype in (FileType.JPG, FileType.PNG, FileType.TIFF, FileType.EMPTY):
        pytest.skip()
    extension = filetype.name.lower()
    # Module name defaults to the extension unless FILETYPE_TO_MODULE overrides it.
    filetype_module = FILETYPE_TO_MODULE.get(filetype, extension)
    fun_name = "partition_" + filetype_module
    module = import_module(f"unstructured.partition.{filetype_module}")
    # Plain attribute lookup; no need to eval a constructed source string.
    fun = getattr(module, fun_name)
    for candidate in pathlib.Path("example-docs").iterdir():
        if candidate.is_file() and candidate.suffix == f".{extension}":
            elements = fun(str(candidate))
            assert all(
                el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype]
                for el in elements
                if el.metadata.filetype is not None
            )
            # Only the first matching example file is checked.
            break


def test_auto_partition_xml_from_filename(filename="example-docs/factbook.xml"):
    """Partition the XML example by path without tags; check first text and filename."""
    parsed = partition(filename=filename, xml_keep_tags=False, metadata_filename=filename)

    first_element = parsed[0]
    assert first_element.text == "United States"
    assert first_element.metadata.filename == "factbook.xml"


def test_auto_partition_xml_from_file(filename="example-docs/factbook.xml"):
    """Partition the XML example through a binary file handle without tags."""
    with open(filename, "rb") as xml_file:
        parsed = partition(file=xml_file, xml_keep_tags=False)

    first_text = parsed[0].text
    assert first_text == "United States"


def test_auto_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"):
    """Partition the XML example by path with xml_keep_tags=True; tags stay in the text."""
    parsed = partition(filename=filename, xml_keep_tags=True)

    first_element = parsed[0]
    assert "<leader>Joe Biden</leader>" in first_element.text
    assert first_element.metadata.filename == "factbook.xml"


def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"):
    """Partition the XML example through a binary file handle with xml_keep_tags=True."""
    with open(filename, "rb") as xml_file:
        parsed = partition(file=xml_file, xml_keep_tags=True)

    first_text = parsed[0].text
    assert "<leader>Joe Biden</leader>" in first_text


# MIME type the xlsx tests expect in metadata.filetype.
EXPECTED_XLSX_FILETYPE = (
    "application/vnd.openxmlformats-officedocument"
    ".spreadsheetml.sheet"
)


def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
    """Partition the xlsx example by path: two Table elements with the expected content."""
    parsed = partition(filename=filename, include_header=False)

    for element in parsed:
        assert isinstance(element, Table)
    assert len(parsed) == 2

    first_element = parsed[0]
    first_metadata = first_element.metadata
    assert clean_extra_whitespace(first_element.text) == EXPECTED_TEXT
    assert first_metadata.text_as_html == EXPECTED_TABLE
    assert first_metadata.page_number == 1
    assert first_metadata.filetype == EXPECTED_XLSX_FILETYPE


def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
    """Partition the xlsx example through a binary file handle: two Table elements."""
    with open(filename, "rb") as xlsx_file:
        parsed = partition(file=xlsx_file, include_header=False)

    for element in parsed:
        assert isinstance(element, Table)
    assert len(parsed) == 2

    first_element = parsed[0]
    first_metadata = first_element.metadata
    assert clean_extra_whitespace(first_element.text) == EXPECTED_TEXT
    assert first_metadata.text_as_html == EXPECTED_TABLE
    assert first_metadata.page_number == 1
    assert first_metadata.filetype == EXPECTED_XLSX_FILETYPE


# Expected length of the first element's text in the .xls test below.
EXPECTED_XLS_TEXT_LEN = 507


# Expected first 45 characters of the first element's whitespace-cleaned text.
EXPECTED_XLS_INITIAL_45_CLEAN_TEXT = "MA What C datatypes are 8 bits? (assume i386)"

# Expected metadata.text_as_html of the first element in the .xls test below.
# The literal is split into two adjacent strings only to keep one long row
# within the line-length limit; whitespace inside the literal is significant.
EXPECTED_XLS_TABLE = (
    """<table border="1" class="dataframe">
  <tbody>
    <tr>
      <td>MA</td>
      <td>What C datatypes are 8 bits? (assume i386)</td>
      <td>int</td>
      <td></td>
      <td>float</td>
      <td></td>
      <td>double</td>
      <td></td>
      <td>char</td>
    </tr>
    <tr>
      <td>TF</td>
      <td>Bagpipes are awesome.</td>
      <td>true</td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>ESS</td>
      <td>How have the original Henry Hornbostel buildings """
    """influenced campus architecture and design in the last 30 years?</td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>ORD</td>
      <td>Rank the following in their order of operation.</td>
      <td>Parentheses</td>
      <td>Exponents</td>
      <td>Division</td>
      <td>Addition</td>
      <td></td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>FIB</td>
      <td>The student activities fee is</td>
      <td>95</td>
      <td>dollars for students enrolled in</td>
      <td>19</td>
      <td>units or more,</td>
      <td></td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>MAT</td>
      <td>Match the lower-case greek letter with its capital form.</td>
      <td>λ</td>
      <td>Λ</td>
      <td>α</td>
      <td>γ</td>
      <td>Γ</td>
      <td>φ</td>
      <td>Φ</td>
    </tr>
  </tbody>
</table>"""
)


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"):
    """Partition the .xls example by path: three Table elements with the expected first table."""
    parsed = partition(filename=filename, include_header=False)

    for element in parsed:
        assert isinstance(element, Table)
    assert len(parsed) == 3

    first_element = parsed[0]
    cleaned_prefix = clean_extra_whitespace(first_element.text)[:45]
    assert cleaned_prefix == EXPECTED_XLS_INITIAL_45_CLEAN_TEXT
    # NOTE(crag): if the beautifulsoup4 package is installed, some (but not all) additional
    # whitespace is removed, so the expected text length is less than is the case
    # when beautifulsoup4 is *not* installed. E.g.
    # "\n\n\nMA\nWhat C datatypes are 8 bits" vs.
    # '\n  \n    \n      MA\n      What C datatypes are 8 bits?... "
    assert len(first_element.text) == EXPECTED_XLS_TEXT_LEN
    assert first_element.metadata.text_as_html == EXPECTED_XLS_TABLE


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
    """Partition the CSV example by path and check text, HTML table and filetype."""
    parsed = partition(filename=filename)

    first_element = parsed[0]
    first_metadata = first_element.metadata
    assert clean_extra_whitespace(first_element.text) == EXPECTED_TEXT
    assert first_metadata.text_as_html == EXPECTED_TABLE
    assert first_metadata.filetype == "text/csv"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"):
    """Partition the TSV example by path and check text, HTML table and filetype."""
    parsed = partition(filename=filename)

    first_element = parsed[0]
    first_metadata = first_element.metadata
    assert clean_extra_whitespace(first_element.text) == EXPECTED_TEXT
    assert first_metadata.text_as_html == EXPECTED_TABLE
    assert first_metadata.filetype == "text/tsv"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
    """Partition the CSV example through a binary file handle and check the first element."""
    with open(filename, "rb") as csv_file:
        parsed = partition(file=csv_file)

    first_element = parsed[0]
    assert clean_extra_whitespace(first_element.text) == EXPECTED_TEXT
    assert isinstance(first_element, Table)
    assert first_element.metadata.text_as_html == EXPECTED_TABLE
    assert first_element.metadata.filetype == "text/csv"


def test_auto_partition_html_pre_from_file(filename="example-docs/fake-html-pre.htm"):
    """Partition the .htm example by path: no page breaks, expected first element."""
    parsed = partition(filename=filename)

    assert len(parsed) > 0
    categories = [element.category for element in parsed]
    assert "PageBreak" not in categories

    first_element = parsed[0]
    cleaned_text = clean_extra_whitespace(first_element.text)
    assert cleaned_text.startswith("[107th Congress Public Law 56]")
    assert isinstance(first_element, NarrativeText)
    assert first_element.metadata.filetype == "text/html"
    assert first_element.metadata.filename == "fake-html-pre.htm"


def test_auto_partition_works_on_empty_filename(filename="example-docs/empty.txt"):
    """Partitioning the empty example file by path returns an empty list."""
    parsed = partition(filename=filename)
    assert parsed == []


def test_auto_partition_works_on_empty_file(filename="example-docs/empty.txt"):
    """Partitioning the empty example file through a binary handle returns an empty list."""
    with open(filename, "rb") as empty_file:
        parsed = partition(file=empty_file)
        assert parsed == []


def test_auto_partition_org_from_filename(filename="example-docs/README.org"):
    """Partition the .org example by path and check the first element and filetype."""
    parsed = partition(filename=filename)

    first_element = parsed[0]
    assert first_element == Title("Example Docs")
    assert first_element.metadata.filetype == "text/org"


def test_auto_partition_org_from_file(filename="example-docs/README.org"):
    """Partition the org-mode README from a binary handle with an explicit content type."""
    with open(filename, "rb") as org_file:
        parsed = partition(file=org_file, content_type="text/org")

    heading = parsed[0]
    assert heading == Title("Example Docs")
    assert heading.metadata.filetype == "text/org"


def test_auto_partition_rst_from_filename(filename="example-docs/README.rst"):
    """Partition the reStructuredText README by path and check the first element and filetype."""
    heading = partition(filename=filename)[0]

    assert heading == Title("Example Docs")
    assert heading.metadata.filetype == "text/x-rst"


def test_auto_partition_rst_from_file(filename="example-docs/README.rst"):
    """Partition the reStructuredText README from a binary handle with an explicit content type."""
    with open(filename, "rb") as rst_file:
        parsed = partition(file=rst_file, content_type="text/x-rst")

    heading = parsed[0]
    assert heading == Title("Example Docs")
    assert heading.metadata.filetype == "text/x-rst"


def test_auto_partition_metadata_filename():
    """`metadata_filename` passed alongside a file handle sets the element's base filename."""
    path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(path) as text_file:
        parsed = partition(file=text_file, metadata_filename=path)

    expected_name = os.path.basename(path)
    assert parsed[0].metadata.filename == expected_name


def test_auto_partition_warns_about_file_filename_deprecation(caplog):
    """Using the `file_filename` kwarg still sets the filename but logs a deprecation warning."""
    path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(path) as text_file:
        parsed = partition(file=text_file, file_filename=path)

    assert parsed[0].metadata.filename == os.path.basename(path)

    # The warning is checked through the captured log text.
    captured = caplog.text
    assert "WARNING" in captured
    assert "The file_filename kwarg will be deprecated" in captured


def test_auto_partition_raises_with_file_and_metadata_filename():
    """Passing both `file_filename` and `metadata_filename` raises `ValueError`."""
    path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(path) as text_file:
        with pytest.raises(ValueError):
            partition(file=text_file, file_filename=path, metadata_filename=path)


def test_get_partition_with_extras_prompts_for_install_if_missing():
    """With an empty extras map, the `ImportError` message tells the user what to install."""
    with pytest.raises(ImportError) as raised:
        _get_partition_with_extras("pdf", {})

    install_hint = 'Install the pdf dependencies with pip install "unstructured[pdf]"'
    assert install_hint in str(raised.value)


def test_add_chunking_strategy_on_partition_auto():
    """`partition(..., chunking_strategy="by_title")` matches `chunk_by_title` on plain output."""
    html_path = "example-docs/example-10k-1p.html"
    chunked_via_partition = partition(html_path, chunking_strategy="by_title")
    plain_elements = partition(html_path)
    chunked_manually = chunk_by_title(plain_elements)

    # Chunking must change the output, and both routes must agree.
    assert chunked_via_partition != plain_elements
    assert chunked_via_partition == chunked_manually


def test_add_chunking_strategy_on_partition_auto_respects_multipage():
    """`multipage_sections` passed to `partition` is forwarded to by-title chunking.

    Chunks produced through `partition` are compared with `chunk_by_title` applied to the
    plain elements, for both values of the flag (with `combine_under_n_chars=0`). The two
    flag values must also give a different number of chunks.
    """
    html_path = "example-docs/example-10k-1p.html"
    by_title_kwargs = {"chunking_strategy": "by_title", "combine_under_n_chars": 0}

    auto_single_page = partition(html_path, multipage_sections=False, **by_title_kwargs)
    auto_multipage = partition(html_path, multipage_sections=True, **by_title_kwargs)

    raw_elements = partition(html_path)
    manual_single_page = chunk_by_title(
        raw_elements,
        multipage_sections=False,
        combine_under_n_chars=0,
    )
    manual_multipage = chunk_by_title(
        raw_elements,
        multipage_sections=True,
        combine_under_n_chars=0,
    )

    assert auto_single_page == manual_single_page
    assert auto_multipage == manual_multipage
    assert len(auto_multipage) != len(auto_single_page)
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json" on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								import json
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								import os
 								import pathlib
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
+								import warnings
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								from importlib import import_module
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf falls back to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
+								from unittest.mock import patch
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
 								import docx
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								import pytest
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												feat: partition_tsv for tab separated value files (#758)

* first pass at partition_tsv

* working tests

* create constants for tests and debug `make test` failure

* make check and tidy

* undo changes for testing locally

* update changelog and version

* fix bricks.rst

* refactor if statements

* make tidy

* fix README and change try/except to if/else

* update changelog and version

* fix docstring
											
										
										
											2023-06-15 13:50:53 -05:00
+								from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
-												chunk_by_title decorator (#1304)

### Summary

Partial solution to #1185.
Related to #1222.
Creates decorator from `chunk_by_title` cleaning brick.
Breaks a document into sections based on the presence of Title elements.
Also starts a new section under the following conditions:

- If metadata changes, indicating a change in section or page or a
switch to processing attachments. If `multipage_sections=True`, sections
can span pages. `multipage_sections` defaults to True.
- If the length of the section exceeds `new_after_n_chars` characters.
The default is 1500. The **chunking function does not split individual
elements**, so it's possible for a section to exceed that threshold if
an individual element is over `new_after_n_chars` characters, which
could occur with a long NarrativeText element.

Combines sections under these conditions
- Sections under `combine_under_n_chars` characters are combined. The
default is 500.

### Testing

from unstructured.partition.html import partition_html

url = "https://understandingwar.org/backgrounder/russian-offensive-campaign-assessment-august-27-2023-0"
chunks = partition_html(url=url, chunking_strategy="by_title")

for chunk in chunks:
    print(chunk)
    print("\n\n" + "-"*80)
    input()

											
										
										
											2023-09-11 16:00:14 -05:00
+								from unstructured.chunking.title import chunk_by_title
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
+								from unstructured.cleaners.core import clean_extra_whitespace
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								from unstructured.documents.elements import (
 								    Address,
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								    ElementMetadata,
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    ListItem,
 								    NarrativeText,
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
+								    Table,
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    Text,
 								    Title,
 								)
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, FileType
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								from unstructured.partition import auto
-												enhancement: tell users to install missing extras (#1167)

### Summary

Updates `partition` to let users know to install the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating `partition_pdf` (or whichever function that requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
											
										
										
											2023-08-21 23:00:21 -04:00
+								from unstructured.partition.auto import _get_partition_with_extras, partition
-												feat: add `partition_doc` for `.doc` files (#236)

* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
											
										
										
											2023-02-17 09:30:23 -05:00
+								from unstructured.partition.common import convert_office_doc
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json" on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								from unstructured.staging.base import elements_to_json
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								DIRECTORY = pathlib.Path(__file__).parent.resolve()
 								EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
 								EXPECTED_EMAIL_OUTPUT = [
 								    NarrativeText(text="This is a test email to use for unit tests."),
 								    Title(text="Important points:"),
 								    ListItem(text="Roses are red"),
 								    ListItem(text="Violets are blue"),
 								]
-												Bug/635 unicode decode error eml (#739)

* Adds functionality to extract charset info from eml files
* Adds missed file-like object handling in detect_file_encoding
* Adds functionality to replace the MIME encodings for eml files with one of the
   common encodings if a unicode error occurs
* Organize the eml example files in the example-docs/eml directory

											
										
										
											2023-06-16 17:52:13 -07:00
+								EML_TEST_FILE = "eml/fake-email.eml"
-												fix: correct order of kwargs in pandoc (#421)

* fix: correct order of kwargs in pandoc

* only skip epub tests in Docker

* changelog

---------

Co-authored-by: Crag Wolfe <crag@unstructuredai.io>
Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-30 16:54:29 -04:00
+								is_in_docker = os.path.exists("/.dockerenv")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
 								def test_auto_partition_email_from_filename():
-												Bug/635 unicode decode error eml (#739)

* Adds functionality to extract charset info from eml files
* Adds missed file-like object handling in detect_file_encoding
* Adds functionality to replace the MIME encodings for eml files with one of the
   common encodings if a unicode error occurs
* Organize the eml example files in the example-docs/eml directory

											
										
										
											2023-06-16 17:52:13 -07:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(filename=filename, strategy="hi_res")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_EMAIL_OUTPUT
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset output file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												enhancement: add `file_directory` to element metadata (#585)

* enhancement: add `file_directory` to element metadata

* update msg test

* exclude file_directory

* update slack output

* added file directory tests on partition_x paths
											
										
										
											2023-05-15 18:25:39 -04:00
+								    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
 								def test_auto_partition_email_from_file():
-												Bug/635 unicode decode error eml (#739)

* Adds functionality to extract charset info from eml files
* Adds missed file-like object handling in detect_file_encoding
* Adds functionality to replace the MIME encodings for eml files with one of the
   common encodings if a unicode error occurs
* Organize the eml example files in the example-docs/eml directory

											
										
										
											2023-06-16 17:52:13 -07:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    with open(filename) as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(file=f, strategy="hi_res")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_EMAIL_OUTPUT
 								def test_auto_partition_email_from_file_rb():
-												Bug/635 unicode decode error eml (#739)

* Adds functionality to extract charset info from eml files
* Adds missed file-like object handling in detect_file_encoding
* Adds functionality to replace the MIME encodings for eml files with one of the
   common encodings if a unicode error occurs
* Organize the eml example files in the example-docs/eml directory

											
										
										
											2023-06-16 17:52:13 -07:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    with open(filename, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(file=f, strategy="hi_res")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_EMAIL_OUTPUT
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								@pytest.fixture()
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								def mock_docx_document():
 								    document = docx.Document()
 								    document.add_paragraph("These are a few of my favorite things:", style="Heading 1")
 								    # NOTE(robinson) - this should get picked up as a list item due to the •
 								    document.add_paragraph("• Parrots", style="Normal")
 								    document.add_paragraph("Hockey", style="List Bullet")
 								    # NOTE(robinson) - this should get picked up as a title
 								    document.add_paragraph("Analysis", style="Normal")
 								    # NOTE(robinson) - this should get dropped because it is empty
 								    document.add_paragraph("", style="Normal")
 								    # NOTE(robinson) - this should get picked up as a narrative text
 								    document.add_paragraph("This is my first thought. This is my second thought.", style="Normal")
 								    document.add_paragraph("This is my third thought.", style="Body Text")
 								    # NOTE(robinson) - this should just be regular text
 								    document.add_paragraph("2023")
 								    return document
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								@pytest.fixture()
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								def expected_docx_elements():
 								    return [
 								        Title("These are a few of my favorite things:"),
 								        ListItem("Parrots"),
 								        ListItem("Hockey"),
 								        Title("Analysis"),
 								        NarrativeText("This is my first thought. This is my second thought."),
 								        NarrativeText("This is my third thought."),
 								        Text("2023"),
 								    ]
 								def test_auto_partition_docx_with_filename(mock_docx_document, expected_docx_elements, tmpdir):
 								    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
 								    mock_docx_document.save(filename)
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(filename=filename, strategy="hi_res")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert elements == expected_docx_elements
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset outpout file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
 								def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_elements, tmpdir):
 								    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
 								    mock_docx_document.save(filename)
 								    with open(filename, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(file=f, strategy="hi_res")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert elements == expected_docx_elements
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
 								)
 								def test_auto_partition_doc_with_filename(
 								    mock_docx_document,
 								    expected_docx_elements,
 								    tmpdir,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    pass_metadata_filename,
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    content_type,
 								):
-												feat: add `partition_doc` for `.doc` files (#236)

* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
											
										
										
											2023-02-17 09:30:23 -05:00
+								    docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
 								    doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
 								    mock_docx_document.save(docx_filename)
 								    convert_office_doc(docx_filename, tmpdir.dirname, "doc")
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    metadata_filename = doc_filename if pass_metadata_filename else None
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    elements = partition(
 								        filename=doc_filename,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								        metadata_filename=metadata_filename,
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								        content_type=content_type,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        strategy="hi_res",
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    )
-												feat: add `partition_doc` for `.doc` files (#236)

* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
											
										
										
											2023-02-17 09:30:23 -05:00
+								    assert elements == expected_docx_elements
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset output file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == "mock_document.doc"
-												enhancement: add `file_directory` to element metadata (#585)

* enhancement: add `file_directory` to element metadata

* update msg test

* exclude file_directory

* update slack output

* added file directory tests on partition_x paths
											
										
										
											2023-05-15 18:25:39 -04:00
+								    assert elements[0].metadata.file_directory == tmpdir.dirname
-												feat: add `partition_doc` for `.doc` files (#236)

* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
											
										
										
											2023-02-17 09:30:23 -05:00
 								# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
 								# determine that the file is an .doc document
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								@pytest.mark.xfail()
-												feat: add `partition_doc` for `.doc` files (#236)

* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
											
										
										
											2023-02-17 09:30:23 -05:00
+								def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements, tmpdir):
 								    docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
 								    doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
 								    mock_docx_document.save(docx_filename)
 								    convert_office_doc(docx_filename, tmpdir.dirname, "doc")
 								    with open(doc_filename, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(file=f, strategy="hi_res")
-												feat: add `partition_doc` for `.doc` files (#236)

* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
											
										
										
											2023-02-17 09:30:23 -05:00
+								    assert elements == expected_docx_elements
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
 								)
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								def test_auto_partition_html_from_filename(pass_metadata_filename, content_type):
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* parition pptx and tests

* add parition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    metadata_filename = filename if pass_metadata_filename else None
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(
 								        filename=filename,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								        metadata_filename=metadata_filename,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        content_type=content_type,
 								        strategy="hi_res",
 								    )
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset outpout file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												enhancement: add `file_directory` to element metadata (#585)

* enhancement: add `file_directory` to element metadata

* update msg test

* exclude file_directory

* update slack output

* added file directory tests on partition_x paths
											
										
										
											2023-05-15 18:25:39 -04:00
+								    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
 								)
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								def test_auto_partition_html_from_file(pass_metadata_filename, content_type):
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* parition pptx and tests

* add parition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    metadata_filename = filename if pass_metadata_filename else None
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    with open(filename) as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(
 								            file=f,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								            metadata_filename=metadata_filename,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								            content_type=content_type,
 								            strategy="hi_res",
 								        )
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
 								def test_auto_partition_html_from_file_rb():
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* parition pptx and tests

* add parition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    with open(filename, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(file=f, strategy="hi_res")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for emal

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								def test_auto_partition_json_from_filename():
 								    """Test auto-processing an unstructured json output file by filename."""
 								    filename = os.path.join(
 								        EXAMPLE_DOCS_DIRECTORY,
 								        "..",
 								        "test_unstructured_ingest",
 								        "expected-structured-output",
-												chore: refactor ingest tests (#814)

- Adds reusable validation scripts (check-x.sh) to minimize repeated (or near-repeated) code and create one source of truth
- Restructures the location of download and output folders such that they are nested in the test_unstructured_ingest directory
- Adds gitignore for output folders / files to avoid them accidentally getting checked into the repository
- Construct paths as reusable variables declared at top of scripts
- Sort order of flag for ingest calls, across all tests (this makes it easier to parse at a glance)
- OVERWRITE_FIXTURES removes all old fixtures for path to guarantee no stale results are left behind
- Bonus: don't check/exit on expected number of expected outputs when OVERWRITE_FIXTURES is true
- Bonus: exclude file_directory from Slack and Discord test scripts (match convention in all others)
											
										
										
											2023-06-29 16:13:41 -07:00
+								        "azure",
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								        "spring-weather.html.json",
 								    )
 								    with open(filename) as json_f:
 								        json_data = json.load(json_f)
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    json_elems = json.loads(elements_to_json(partition(filename=filename, strategy="hi_res")))
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								    for elem in json_elems:
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								        elem.pop("metadata")
 								    for elem in json_data:
 								        elem.pop("metadata")
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								    assert json_data == json_elems
-												refactor: simplifies JSON detection and add tests (#975)

* refactor json detection

* version and changelog

* fix mock in test
											
										
										
											2023-07-25 15:59:45 -04:00
+								def test_auto_partition_json_raises_with_unprocessable_json(tmpdir):
 								    # NOTE(robinson) - This is unprocessable because it is not a list of dicts,
 								    # per the Unstructured ISD format
 								    text = '{"hi": "there"}'
 								    filename = os.path.join(tmpdir, "unprocessable.json")
 								    with open(filename, "w") as f:
 								        f.write(text)
 								    with pytest.raises(ValueError):
 								        partition(filename=filename)
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								@pytest.mark.xfail(
 								    reason="parsed as text not json, https://github.com/Unstructured-IO/unstructured/issues/492",
 								)
 								def test_auto_partition_json_from_file():
 								    """Test auto-processing an unstructured json output file by file handle."""
 								    filename = os.path.join(
 								        EXAMPLE_DOCS_DIRECTORY,
 								        "..",
 								        "test_unstructured_ingest",
 								        "expected-structured-output",
 								        "azure-blob-storage",
 								        "spring-weather.html.json",
 								    )
 								    with open(filename) as json_f:
 								        json_data = json.load(json_f)
 								    with open(filename, encoding="utf-8") as partition_f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        json_elems = json.loads(elements_to_json(partition(file=partition_f, strategy="hi_res")))
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								    for elem in json_elems:
 								        # coordinates are always in the element data structures, even if None
 								        elem.pop("coordinates")
-												feat: coordinate systems (#774)

Added the CoordinateSystem class for tracking the system in which coordinates are represented, and changing the system if desired.
											
										
										
											2023-06-20 11:19:55 -05:00
+								        elem.pop("coordinate_system")
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								    assert json_data == json_elems
-												feat: add support for `.txt` files in `partition` (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
											
										
										
											2023-01-13 16:39:53 -05:00
+								EXPECTED_TEXT_OUTPUT = [
 								    NarrativeText(text="This is a test document to use for unit tests."),
-												fix: cleanup from live `.docx` tests (#177)

* add env var for cap threshold; raise default threshold

* update docs and tests

* added check for ending in a comma

* update docs

* no caps check for all upper text

* capture Text in html and text

* check category in Text equality check

* lower case all caps before checking for verbs

* added check for us city/state/zip

* added address type

* add address to html

* add address to text

* fix for text tests; escape for large text segments

* refactor regex for readability

* update comment

* additional test for text with linebreaks

* update docs

* update changelog

* update elements docs

* remove old comment

* case -> cast

* type fix
											
										
										
											2023-01-26 10:52:25 -05:00
+								    Address(text="Doylestown, PA 18901"),
-												feat: add support for `.txt` files in `partition` (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
											
										
										
											2023-01-13 16:39:53 -05:00
+								    Title(text="Important points:"),
 								    ListItem(text="Hamburgers are delicious"),
 								    ListItem(text="Dogs are the best"),
 								    ListItem(text="I love fuzzy blankets"),
 								]
 								def test_auto_partition_text_from_filename():
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* parition pptx and tests

* add parition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg defalut to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(filename=filename, strategy="hi_res")
-												feat: add support for `.txt` files in `partition` (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
											
										
										
											2023-01-13 16:39:53 -05:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_TEXT_OUTPUT
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset outpout file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												enhancement: add `file_directory` to element metadata (#585)

* enhancement: add `file_directory` to element metadata

* update msg test

* exclude file_directory

* update slack output

* added file directory tests on partition_x paths
											
										
										
											2023-05-15 18:25:39 -04:00
+								    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
-												feat: add support for `.txt` files in `partition` (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
											
										
										
											2023-01-13 16:39:53 -05:00
 								def test_auto_partition_text_from_file():
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    with open(filename) as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(file=f, strategy="hi_res")
-												feat: add support for `.txt` files in `partition` (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
											
										
										
											2023-01-13 16:39:53 -05:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_TEXT_OUTPUT
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
 								)
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								def test_auto_partition_pdf_from_filename(pass_metadata_filename, content_type, request):
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    metadata_filename = filename if pass_metadata_filename else None
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(
 								        filename=filename,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								        metadata_filename=metadata_filename,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        content_type=content_type,
 								        strategy="hi_res",
 								    )
-												chore: return `Element` objects in `partition_pdf` and `partition_image` (#164)

* helper function to convert to element

* test for element types

* fix for healthcheck url

* version bump

* note on coordinates

* mention FigureCaption

* test_shared -> test_common

* add check boxes for checkbox template

* update changelog
											
										
										
											2023-01-19 09:29:28 -05:00
 								    assert isinstance(elements[0], Title)
 								    assert elements[0].text.startswith("LayoutParser")
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset output file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												enhancement: add `file_directory` to element metadata (#585)

* enhancement: add `file_directory` to element metadata

* update msg test

* exclude file_directory

* update slack output

* added file directory tests on partition_x paths
											
										
										
											2023-05-15 18:25:39 -04:00
+								    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
-												feat: add metadata tracking to document elements (#225)

* add metadata field to elements

* metadata tracking for pdf/image

* metadata for html

* update expected outputs

* metadata for the rest of the document types

* take out file metadata for now

* add url to tables

* added metadata to test_auto

* bump version

* added coordinates to __init__

* fix coordinates in tests
											
										
										
											2023-02-15 13:26:20 -05:00
-												build(deps): update inference version (#662)

Updated to the latest version of unstructured-inference. detectron2 now gets implemented with onnxruntime, yay!

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
											
										
										
											2023-05-31 13:50:15 -05:00
+								    # NOTE(alan): Xfail since new model skips the word Zejiang
 								    request.applymarker(pytest.mark.xfail)
 								    assert isinstance(elements[1], NarrativeText)
 								    assert elements[1].text.startswith("Zejiang Shen")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												feat: extract tables (#503)

Exposes table extraction through partition and partition_pdf.
											
										
										
											2023-04-21 12:01:29 -05:00
+								def test_auto_partition_pdf_uses_table_extraction():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
 								    with patch(
 								        "unstructured_inference.inference.layout.process_file_with_model",
 								    ) as mock_process_file_with_model:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        partition(filename, pdf_infer_table_structure=True, strategy="hi_res")
-												feat: extract tables (#503)

Exposes table extraction through partition and partition_pdf.
											
										
										
											2023-04-21 12:01:29 -05:00
+								        assert mock_process_file_with_model.call_args[1]["extract_tables"]
-												enhancement: tell users to install missing extras (#1167)

### Summary

Updates `partition` to let users know to install the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating `partition_pdf` (or whichever function that requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
											
										
										
											2023-08-21 23:00:21 -04:00
+								def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf falls back to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
 								    mock_return = [NarrativeText("Hello there!")]
 								    with patch.object(auto, "partition_pdf", return_value=mock_return) as mock_partition:
-												enhancement: tell users to install missing extras (#1167)

### Summary

Updates `partition` to let users know to install the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating `partition_pdf` (or whichever function that requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
											
										
										
											2023-08-21 23:00:21 -04:00
+								        mock_partition_with_extras_map = {"pdf": mock_partition}
 								        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf falls back to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
+								        partition(filename=filename, strategy="fast")
 								    mock_partition.assert_called_once_with(
 								        filename=filename,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								        metadata_filename=None,
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf falls back to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
+								        file=None,
 								        url=None,
 								        include_page_breaks=False,
-												chore: change table param name (#513)

Updated parameter names that controls whether we try to infer table structure.
											
										
										
											2023-04-21 13:48:19 -05:00
+								        infer_table_structure=False,
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf falls back to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
+								        strategy="fast",
-												chore: refactor languages parameter for auto partition (#1400)

### Summary
In order to support language functionality other than Tesseract OCR, we
want to represent languages provided for either partitioning accuracy or
OCR as a standard list of langcodes as strings.

### Details
Follows the pattern established with PDFs in #1334. Adds languages (a
list of strings) as a parameter to partition in auto.py. Marks
ocr_languages for deprecation.

### Test
Call partition with a variety of filetypes (especially pdfs/images),
strategies, languages, or ocr_languages.
- inclusion of ocr_languages as a parameter should display a deprecation
warning and may proceed with partitioning if no other conflicts
- the other valid call outputs should be no different from the current
outputs
											
										
										
											2023-09-13 13:07:28 -04:00
+								        languages=["eng"],
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf falls back to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
+								    )
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
 								)
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, request):
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    metadata_filename = filename if pass_metadata_filename else None
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    with open(filename, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(
 								            file=f,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								            metadata_filename=metadata_filename,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								            content_type=content_type,
 								            strategy="hi_res",
 								        )
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
 								    assert isinstance(elements[0], Title)
 								    assert elements[0].text.startswith("LayoutParser")
-												build(deps): update inference version (#662)

Updated to the latest version of unstructured-inference. detectron2 now gets implemented with onnxruntime, yay!

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
											
										
										
											2023-05-31 13:50:15 -05:00
+								    # NOTE(alan): Xfail since new model misses the first word Zejiang
 								    request.applymarker(pytest.mark.xfail)
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
+								    assert isinstance(elements[1], NarrativeText)
-												build(deps): bump requirements (#414)


											
										
										
											2023-04-04 19:59:06 -07:00
+								    assert elements[1].text.startswith("Zejiang Shen")
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
-												chore: function to map between standard and Tesseract language codes (#1421)

### Summary
In order to convert between incompatible language codes from packages
used for OCR, this change adds a function to map between any standard
language codes and tesseract OCR specific codes. Users can input
language information to `languages` in any Tesseract-supported langcode
or any ISO 639 standard language code.

### Details
- Introduces the
[python-iso639](https://pypi.org/project/python-iso639/) package for
matching standard language codes. Recompiles all dependencies.
- If a language is not already supplied by the user as a Tesseract
specific langcode, supplies all possible script/orthography variants of
the language to the Tesseract OCR agent.

### Test
Added many unit tests for a variety of language combinations, special
cases, and variants. For general testing, call partition functions with
any lang codes in the languages parameter (Tesseract or standard).

for example,
```
from unstructured.partition.auto import partition

elements = partition(filename="example-docs/layout-parser-paper.pdf", strategy="hi_res", languages=["en", "chi"])
print("\n\n".join([str(el) for el in elements]))
```
should supply eng+chi_sim+chi_sim_vert+chi_tra+chi_tra_vert to Tesseract
											
										
										
											2023-09-18 11:42:02 -04:00
+								def test_auto_partition_formats_languages_for_tesseract():
 								    filename = "example-docs/chi_sim_image.jpeg"
 								    with patch(
 								        "unstructured_inference.inference.layout.process_file_with_model",
 								    ) as mock_process_file_with_model:
 								        partition(filename, strategy="hi_res", languages=["zh"])
 								        mock_process_file_with_model.assert_called_once_with(
 								            filename,
 								            is_image=True,
 								            ocr_languages="chi_sim+chi_sim_vert+chi_tra+chi_tra_vert",
 								            ocr_mode="entire_page",
 								            extract_tables=False,
 								            model_name=None,
 								        )
-												chore: refactor languages parameter for auto partition (#1400)

### Summary
In order to support language functionality other than Tesseract OCR, we
want to represent languages provided for either partitioning accuracy or
OCR as a standard list of langcodes as strings.

### Details
Follows the pattern established with PDFs in #1334. Adds languages (a
list of strings) as a parameter to partition in auto.py. Marks
ocr_languages for deprecation.

### Test
Call partition with a variety of filetypes (especially pdfs/images),
strategies, languages, or ocr_languages.
- inclusion of ocr_languages as a parameter should display a deprecation
warning and may proceed with partitioning if no other conflicts
- the other valid call outputs should be no different from the current
outputs
											
										
										
											2023-09-13 13:07:28 -04:00
+								def test_auto_partition_warns_with_ocr_languages(caplog):
 								    filename = "example-docs/chevron-page.pdf"
 								    partition(filename=filename, strategy="hi_res", ocr_languages="eng")
 								    assert "The ocr_languages kwarg will be deprecated" in caplog.text
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
+								def test_partition_pdf_doesnt_raise_warning():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
 								    # NOTE(robinson): This is the recommended way to check that no warning is emitted,
 								    # per the pytest docs.
 								    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
 								    #      #additional-use-cases-of-warnings-in-tests
 								    with warnings.catch_warnings():
 								        warnings.simplefilter("error")
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        partition(filename=filename, strategy="hi_res")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												Set default strategy for images to be "hi_res" (#968)

Set default strategy for images (not PDFs) to be hi_res.
											
										
										
											2023-08-02 09:22:20 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Set default strategy for images to be "hi_res" (#968)

Set default strategy for images (not PDFs) to be hi_res.
											
										
										
											2023-08-02 09:22:20 -07:00
+								    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
 								)
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								def test_auto_partition_image_default_strategy_hi_res(pass_metadata_filename, content_type):
-												Set default strategy for images to be "hi_res" (#968)

Set default strategy for images (not PDFs) to be hi_res.
											
										
										
											2023-08-02 09:22:20 -07:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    metadata_filename = filename if pass_metadata_filename else None
-												Set default strategy for images to be "hi_res" (#968)

Set default strategy for images (not PDFs) to be hi_res.
											
										
										
											2023-08-02 09:22:20 -07:00
+								    elements = partition(
 								        filename=filename,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								        metadata_filename=metadata_filename,
-												Set default strategy for images to be "hi_res" (#968)

Set default strategy for images (not PDFs) to be hi_res.
											
										
										
											2023-08-02 09:22:20 -07:00
+								        content_type=content_type,
 								        strategy="auto",
 								    )
 								    # should be same result as test_partition_image_default_strategy_hi_res() in test_image.py
 								    first_line = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
 								    assert elements[0].text == first_line
 								    assert elements[0].metadata.coordinates is not None
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
 								)
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								def test_auto_partition_jpg(pass_metadata_filename, content_type):
-												enhancement: auto strategy for PDFs and images (#578)

* added functions for determining auto strategy

* change default strategy to auto

* tests for auto strategy

* update docs

* changelog and version

* bump version

* remove ingest file in wrong location

* update jpg output

* typo fix
											
										
										
											2023-05-12 13:45:08 -04:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    metadata_filename = filename if pass_metadata_filename else None
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(
 								        filename=filename,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								        metadata_filename=metadata_filename,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        content_type=content_type,
-												Fix: Pass `strategy` parameter down from `partition` for `partition_image` (#708)

* changelog and version

* passing param down

* test should be auto

* doc nit

* lint

* update image output
											
										
										
											2023-06-09 13:54:18 -04:00
+								        strategy="auto",
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    )
-												feat: partition image (#144)

Adds partition_image to partition image file types, which is integrated into the partition brick. This relies on the 0.2.2 version of unstructured-inference.
											
										
										
											2023-01-13 22:24:13 -06:00
+								    assert len(elements) > 0
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    ("pass_metadata_filename", "content_type"),
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
 								)
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								def test_auto_partition_jpg_from_file(pass_metadata_filename, content_type):
-												enhancement: auto strategy for PDFs and images (#578)

* added functions for determining auto strategy

* change default strategy to auto

* tests for auto strategy

* update docs

* changelog and version

* bump version

* remove ingest file in wrong location

* update jpg output

* typo fix
											
										
										
											2023-05-12 13:45:08 -04:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    metadata_filename = filename if pass_metadata_filename else None
-												feat: partition image (#144)

Adds partition_image to partition image file types, which is integrated into the partition brick. This relies on the 0.2.2 version of unstructured-inference.
											
										
										
											2023-01-13 22:24:13 -06:00
+								    with open(filename, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(
 								            file=f,
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								            metadata_filename=metadata_filename,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								            content_type=content_type,
-												Fix: Pass `strategy` parameter down from `partition` for `partition_image` (#708)

* changelog and version

* passing param down

* test should be auto

* doc nit

* lint

* update image output
											
										
										
											2023-06-09 13:54:18 -04:00
+								            strategy="auto",
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        )
-												feat: partition image (#144)

Adds partition_image to partition image file types, which is integrated into the partition brick. This relies on the 0.2.2 version of unstructured-inference.
											
										
										
											2023-01-13 22:24:13 -06:00
+								    assert len(elements) > 0
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								def test_auto_partition_raises_with_bad_type(monkeypatch):
 								    monkeypatch.setattr(auto, "detect_filetype", lambda *args, **kwargs: None)
 								    with pytest.raises(ValueError):
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        partition(filename="made-up.fake", strategy="hi_res")
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
 								EXPECTED_PPTX_OUTPUT = [
 								    Title(text="Adding a Bullet Slide"),
 								    ListItem(text="Find the bullet slide layout"),
 								    ListItem(text="Use _TextFrame.text for first bullet"),
 								    ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
 								    NarrativeText(text="Here is a lot of text!"),
 								    NarrativeText(text="Here is some text in a text box!"),
 								]
 								def test_auto_partition_pptx_from_filename():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(filename=filename, strategy="hi_res")
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    assert elements == EXPECTED_PPTX_OUTPUT
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset output file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												enhancement: add `file_directory` to element metadata (#585)

* enhancement: add `file_directory` to element metadata

* update msg test

* exclude file_directory

* update slack output

* added file directory tests on partition_x paths
											
										
										
											2023-05-15 18:25:39 -04:00
+								    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
-												feat: optional page breaks for `.pptx`, `.pdf`, `.html` and images (#205)

* page breaks for pptx

* added page breaks for image/pdf

* tests for images with page breaks

* page breaks for html documents

* linting, linting, linting

* changelog and bump version

* update docs

* fix typo

* refactor reusable code to common.py

* add type back in
											
										
										
											2023-02-08 10:11:15 -05:00
-												fix: correct order of kwargs in pandoc (#421)

* fix: correct order of kwargs in pandoc

* only skip epub tests in Docker

* changelog

---------

Co-authored-by: Crag Wolfe <crag@unstructuredai.io>
Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-30 16:54:29 -04:00
+								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
-												feat: add `partition_ppt` for older power point docs (#238)

* added partition_ppt function and tests

* add ppt support to auto

* version bump

* update docs

* doc fixes

* update changelog

* `.docx` -> `.pptx`

* its -> their

* remove whitespace
											
										
										
											2023-02-17 11:57:08 -05:00
+								def test_auto_partition_ppt_from_filename():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(filename=filename, strategy="hi_res")
-												feat: add `partition_ppt` for older power point docs (#238)

* added partition_ppt function and tests

* add ppt support to auto

* version bump

* update docs

* doc fixes

* update changelog

* `.docx` -> `.pptx`

* its -> their

* remove whitespace
											
										
										
											2023-02-17 11:57:08 -05:00
+								    assert elements == EXPECTED_PPTX_OUTPUT
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset output file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												enhancement: add `file_directory` to element metadata (#585)

* enhancement: add `file_directory` to element metadata

* update msg test

* exclude file_directory

* update slack output

* added file directory tests on partition_x paths
											
										
										
											2023-05-15 18:25:39 -04:00
+								    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
-												feat: add `partition_ppt` for older power point docs (#238)

* added partition_ppt function and tests

* add ppt support to auto

* version bump

* update docs

* doc fixes

* update changelog

* `.docx` -> `.pptx`

* its -> their

* remove whitespace
											
										
										
											2023-02-17 11:57:08 -05:00
-												feat: optional page breaks for `.pptx`, `.pdf`, `.html` and images (#205)

* page breaks for pptx

* added page breaks for image/pdf

* tests for images with page breaks

* page breaks for html documents

* linting, linting, linting

* changelog and bump version

* update docs

* fix typo

* refactor reusable code to common.py

* add type back in
											
										
										
											2023-02-08 10:11:15 -05:00
def test_auto_with_page_breaks():
    """Partition a PDF with ``include_page_breaks=True`` and expect a PageBreak element."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    partitioned = partition(filename=pdf_path, include_page_breaks=True, strategy="hi_res")
    # At least one element must report the "PageBreak" category.
    assert any(element.category == "PageBreak" for element in partitioned)
-												feat: add `partition_epub` function (#364)

* add pypandoc dependency

* added epub partitioner and file conversion

* test for partition_epub

* tests for file conversion

* add epub to filetype detection

* added epub to auto partition

* update bricks docs

* updated installing docs

* changelot and version

* add pandoc to dependencies

* add pandoc to debian dependencies

* linting, linting, linting

* typo fix

* typo fix

* file conversion type hints

* more type hints

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-03-14 11:52:21 -04:00
 								def test_auto_partition_epub_from_filename():
 								    """Partition the EPUB example by path and check how the first element starts."""
 								    epub_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
 								    partitioned = partition(filename=epub_path, strategy="hi_res")
 								    assert len(partitioned) > 0
 								    opening_text = partitioned[0].text
 								    assert opening_text.startswith("The Project Gutenberg eBook of Winter Sports")
 								def test_auto_partition_epub_from_file():
 								    """Partition the EPUB example from a binary file handle; check the first element."""
 								    epub_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
 								    with open(epub_path, "rb") as epub_file:
 								        partitioned = partition(file=epub_file, strategy="hi_res")
 								    assert len(partitioned) > 0
 								    opening_text = partitioned[0].text
 								    assert opening_text.startswith("The Project Gutenberg eBook of Winter Sports")
-												feat: add `partition_msg` for MSFT Outlook files (#412)

* added msg-parser dependency

* pass through kwargs in convert_file_to_text

* added partition_msg for processing msft outlook files

* version bump and changelog

* added tests for partition_msg

* added test for msg with plain text

* add partition_msg docs; fix underlines in integration docs

* add .msg to file list

* finish tests for auto msg

* linting, linting, linting
											
										
										
											2023-03-28 16:15:22 -04:00
 								# Elements the .msg test below expects from partitioning "fake-email.msg".
 								EXPECTED_MSG_OUTPUT = [
 								    NarrativeText(text="This is a test email to use for unit tests."),
 								    Title(text="Important points:"),
 								    *(ListItem(text=item_text) for item_text in ("Roses are red", "Violets are blue")),
 								]
 								def test_auto_partition_msg_from_filename():
 								    """Partition the ``.msg`` example by path and compare with EXPECTED_MSG_OUTPUT."""
 								    msg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
 								    partitioned = partition(filename=msg_path, strategy="hi_res")
 								    assert partitioned == EXPECTED_MSG_OUTPUT
-												feat: add `partition_rtf` for rich text files (#466)

* refactor epub; add rtf

* added test for rtf files

* filetype detection for rtf files

* add rtf to auto

* update docs for group_broken_paragraphs

* add rtf to docs

* update file list in readme

* update stage_for_transformers docs

* changelog and version bump

* skip rtf if in docker

* skip test if rtf not supported

* docs tweaks
											
										
										
											2023-04-10 17:25:03 -04:00
 								def test_auto_partition_rtf_from_filename():
 								    """Partition the ``.rtf`` example by path and check the leading Title element."""
 								    rtf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf")
 								    partitioned = partition(filename=rtf_path, strategy="hi_res")
 								    first_element = partitioned[0]
 								    assert first_element == Title("My First Heading")
-												feat: add `url` kwarg to `partititon` (#470)

* added url option to auto partition

* add test for partition from url

* version and changelog

* update docs

* add url to element metadata
											
										
										
											2023-04-12 14:31:01 -04:00
 								def test_auto_partition_from_url():
 								    """Partition a document fetched by URL as ``text/plain``; check title and URL metadata."""
 								    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
 								    partitioned = partition(url=license_url, content_type="text/plain", strategy="hi_res")
 								    first_element = partitioned[0]
 								    assert first_element == Title("Apache License")
 								    # The source URL must be recorded on the element's metadata.
 								    assert first_element.metadata.url == license_url
-												feat: allow headers in `partition` (#473)

* feat: allow headers in `partition`

* warning if header is set and url is not

* update emoji test
											
										
										
											2023-04-13 11:04:15 -04:00
-												fix: updates markdown code to process markdown with embedded html (#480)

* add carriage return to html if missing

* test on markdown with embedded html

* changelog and version

* check for html parser

* linting, linting, linting
											
										
										
											2023-04-13 12:47:45 -04:00
def test_partition_md_works_with_embedded_html():
    """Partition README.md fetched by URL with a ``text/markdown`` content type.

    Passes when at least one of the resulting elements has text that
    contains "unstructured".
    """
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"
    elements = partition(url=url, content_type="text/markdown", strategy="hi_res")
    # Scan every element, not only the first one.
    assert any("unstructured" in element.text for element in elements)
-												feat: allow headers in `partition` (#473)

* feat: allow headers in `partition`

* warning if header is set and url is not

* update emoji test
											
										
										
											2023-04-13 11:04:15 -04:00
def test_auto_partition_warns_if_header_set_and_not_url(caplog):
    """Pass ``headers`` with a local filename and expect the first log record to be a WARNING."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
    request_headers = {"Accept": "application/pdf"}
    partition(filename=eml_path, headers=request_headers, strategy="hi_res")
    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"
-												fix: update `detect_filetype` for JSONs with text/plain MIME type (#520)

* check to see if text file is a json

* add json check into filetype detection

* added test for updated file detection logic

* bytes/strings handling

* changlog and version bump
											
										
										
											2023-04-26 13:52:47 -04:00
 								def test_auto_partition_works_with_unstructured_jsons():
 								    """Partition the JSON example by path and check the first element's text."""
 								    json_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
 								    partitioned = partition(filename=json_path, strategy="hi_res")
 								    first_text = partitioned[0].text
 								    assert first_text == "News Around NOAA"
 								def test_auto_partition_works_with_unstructured_jsons_from_file():
 								    """Partition the JSON example from a binary file handle; check the first element's text."""
 								    json_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
 								    with open(json_path, "rb") as json_file:
 								        partitioned = partition(file=json_file, strategy="hi_res")
 								    first_text = partitioned[0].text
 								    assert first_text == "News Around NOAA"
-												feat: add `partition_odt` for open office docs (#548)

* added filetype detection for odt

* add function for partition odt documents

* add odt files to auto

* changelog and version

* docs and readme

* update installation docs

* skip tests if not supported or in docker

* import pytest

* fix docs typos
											
										
										
											2023-05-04 15:28:08 -04:00
 								def test_auto_partition_odt_from_filename():
 								    """Partition the ``.odt`` example by path and check the leading Title element."""
 								    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
 								    partitioned = partition(filename=odt_path, strategy="hi_res")
 								    first_element = partitioned[0]
 								    assert first_element == Title("Lorem ipsum dolor sit amet.")
-												feat: add `partition_odt` for open office docs (#548)

* added filetype detection for odt

* add function for partition odt documents

* add odt files to auto

* changelog and version

* docs and readme

* update installation docs

* skip tests if not supported or in docker

* import pytest

* fix docs typos
											
										
										
											2023-05-04 15:28:08 -04:00
 								def test_auto_partition_odt_from_file():
 								    """Partition the ``.odt`` example from a binary file handle; check the leading Title."""
 								    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
 								    with open(odt_path, "rb") as odt_file:
 								        partitioned = partition(file=odt_file, strategy="hi_res")
 								    first_element = partitioned[0]
 								    assert first_element == Title("Lorem ipsum dolor sit amet.")
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
 								@pytest.mark.parametrize(
 								    ("content_type", "routing_func", "expected"),
 								    [
 								        ("text/csv", "csv", "text/csv"),
 								        ("text/html", "html", "text/html"),
 								        ("jdsfjdfsjkds", "pdf", None),
 								    ],
 								)
 								def test_auto_adds_filetype_to_metadata(content_type, routing_func, expected, monkeypatch):
 								    """Check the ``filetype`` metadata on elements returned by ``partition``.

 								    The ``partition_<routing_func>`` function in ``unstructured.partition.auto``
 								    is patched with a stub returning two bare ``Text`` elements, and the same
 								    stub is installed as the only entry of ``PARTITION_WITH_EXTRAS_MAP``.
 								    Both returned elements must have ``metadata.filetype == expected``.
 								    """
 								    # Build the path from EXAMPLE_DOCS_DIRECTORY so the test does not depend on
 								    # the directory pytest is launched from.
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
 								    with patch(
 								        f"unstructured.partition.auto.partition_{routing_func}",
 								        lambda *args, **kwargs: [Text("text 1"), Text("text 2")],
 								    ) as mock_partition:
 								        mock_partition_with_extras_map = {routing_func: mock_partition}
 								        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
 								        elements = partition(filename, content_type=content_type)
 								    assert len(elements) == 2
 								    assert all(el.metadata.filetype == expected for el in elements)
 								@pytest.mark.parametrize(
 								    ("content_type", "expected"),
 								    [
 								        ("application/pdf", FILETYPE_TO_MIMETYPE[FileType.PDF]),
 								        (None, FILETYPE_TO_MIMETYPE[FileType.PDF]),
 								    ],
 								)
 								def test_auto_filetype_overrides_file_specific(content_type, expected, monkeypatch):
 								    """Check that ``partition`` replaces a filetype set by the PDF partitioner.

 								    ``partition_pdf`` is patched with a stub whose two ``Text`` elements carry
 								    ``filetype="imapdf"`` in their metadata. The elements returned by
 								    ``partition`` must instead report ``expected`` as their filetype.
 								    """
 								    pdf_metadata = ElementMetadata(filetype="imapdf")
 								    # Build the path from EXAMPLE_DOCS_DIRECTORY so the test does not depend on
 								    # the directory pytest is launched from.
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
 								    with patch(
 								        "unstructured.partition.auto.partition_pdf",
 								        lambda *args, **kwargs: [
 								            Text("text 1", metadata=pdf_metadata),
 								            Text("text 2", metadata=pdf_metadata),
 								        ],
 								    ) as mock_partition:
 								        mock_partition_with_extras_map = {"pdf": mock_partition}
 								        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
 								        elements = partition(filename, content_type=content_type)
 								    assert len(elements) == 2
 								    assert all(el.metadata.filetype == expected for el in elements)
 								# Every FileType member except UNK, ZIP and XLS.
 								supported_filetypes = [
 								    filetype
 								    for filetype in FileType
 								    if filetype not in (FileType.UNK, FileType.ZIP, FileType.XLS)
 								]
 								# File types whose partition module is not named after the lowercased enum member
 								# (e.g. JPG and PNG both map to the "image" module name).
 								FILETYPE_TO_MODULE = {
 								    FileType.JPG: "image",
 								    FileType.PNG: "image",
 								    FileType.TXT: "text",
 								    FileType.EML: "email",
 								}
 								@pytest.mark.parametrize("filetype", supported_filetypes)
 								def test_file_specific_produces_correct_filetype(filetype: FileType):
-												feat: supports multipage tiff (#1131)

Add test case test_partition_image_with_multipage_tiff that reads multipage TIFF file and

- confirms that the function reads all the pages in the TIFF.

- page number is added to the metadata

This PR is branched from and developed on top of 6d6be99 commit.
											
										
										
											2023-08-24 11:12:50 -04:00
+								    if filetype in (FileType.JPG, FileType.PNG, FileType.TIFF, FileType.EMPTY):
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								        pytest.skip()
 								    extension = filetype.name.lower()
 								    filetype_module = (
 								        extension if filetype not in FILETYPE_TO_MODULE else FILETYPE_TO_MODULE[filetype]
 								    )
 								    fun_name = "partition_" + filetype_module
 								    module = import_module(f"unstructured.partition.{filetype_module}")  # noqa
 								    fun = eval(f"module.{fun_name}")
 								    for file in pathlib.Path("example-docs").iterdir():
 								        if file.is_file() and file.suffix == f".{extension}":
 								            elements = fun(str(file))
-												enhancement: handling for empty files in `detect_filetype` and `partition` (#710)

* add empty filetype

* add empty handling to partition

* changelog and version
											
										
										
											2023-06-09 16:07:50 -04:00
+								            assert all(
 								                el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype]
 								                for el in elements
 								                if el.metadata.filetype is not None
 								            )
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								            break
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
-												feat: add `partition_xml` for XML files (#596)

* first pass on partition_xml

* add option to keep xml tags

* added tests for xml

* fix filename

* update filenames

* remove outdated readme

* add xml to auto

* version and changelog

* update readme and docs

* pass through include_metadata

* update include_metadata description

* add README back in

* linting, linting, linting

* more linting

* spooled to bytes doesn't need to be a tuple

* Add tests for newly supported filetypes

* Correct metadata filetype

* doc typo

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* keep_xml_tags -> xml_keep_tags

---------

Co-authored-by: Alan Bertl <alan@unstructured.io>
Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-05-18 11:40:12 -04:00
+								def test_auto_partition_xml_from_filename(filename="example-docs/factbook.xml"):
-												feat: `partition_xml` infers element type on each leaf node (#1249)

### Summary

Closes #1229. Updates `partition_xml` so that the element type is
inferred on each leaf node when `xml_keep_tags=False` instead of
delegating splitting and partitioning to `partition_text`. If
`xml_keep_tags=True`, the file is treated like a text file still and
partitioning is still delegated to `partition_text`.

Also adds the option to pass `text` as an input to `partition_xml`.

### Testing

Create a `parrots.xml` file that looks like:

```xml
<xml><parrot><name>Conure</name><description>A conure is a very friendly bird.

Conures are feathery and like to dance.</description></parrot></xml>
```

Run:

```python
from unstructured.partition.xml import partition_xml
from unstructured.staging.base import convert_to_dict

elements = partition_xml(filename="parrots.xml")
convert_to_dict(elements)
```

On `main`, the output is the following. Notice how the `<name>` tag
incorrectly gets merged into `<description>` in the first element.

```python
[{'element_id': '7ae4074435df8dfcefcf24a4e6c52026',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conure A conure is a very friendly bird.',
  'type': 'NarrativeText'},
 {'element_id': '859ecb332da6961acd2fb6a0185d1549',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conures are feathery and like to dance.',
  'type': 'NarrativeText'}]

```

On the feature branch, the output is the following, and the tags are
correctly separated.

```python
[{'element_id': '5512218914e4eeacf71a9cd42c373710',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conure',
  'type': 'Title'},
 {'element_id': '113bf8d250c2b1a77c9c2caa4b812f85',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'A conure is a very friendly bird.\n'
          '\n'
          'Conures are feathery and like to dance.',
  'type': 'NarrativeText'}]

```
											
										
										
											2023-08-30 17:07:10 -04:00
+								    elements = partition(filename=filename, xml_keep_tags=False, metadata_filename=filename)
-												feat: add `partition_xml` for XML files (#596)

* first pass on partition_xml

* add option to keep xml tags

* added tests for xml

* fix filename

* update filenames

* remove outdated readme

* add xml to auto

* version and changelog

* update readme and docs

* pass through include_metadata

* update include_metadata description

* add README back in

* linting, linting, linting

* more linting

* spooled to bytes doesn't need to be a tuple

* Add tests for newly supported filetypes

* Correct metadata filetype

* doc typo

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* keep_xml_tags -> xml_keep_tags

---------

Co-authored-by: Alan Bertl <alan@unstructured.io>
Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-05-18 11:40:12 -04:00
 								    assert elements[0].text == "United States"
 								    assert elements[0].metadata.filename == "factbook.xml"
 								def test_auto_partition_xml_from_file(filename="example-docs/factbook.xml"):
 								    """Partition the XML from a binary file handle with `xml_keep_tags=False`."""
 								    with open(filename, "rb") as xml_file:
 								        partitioned = partition(file=xml_file, xml_keep_tags=False)
 								    first_element = partitioned[0]
 								    assert first_element.text == "United States"
 								def test_auto_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"):
 								    elements = partition(filename=filename, xml_keep_tags=True)
-												feat: `partition_xml` infers element type on each leaf node (#1249)

### Summary

Closes #1229. Updates `partition_xml` so that the element type is
inferred on each leaf node when `xml_keep_tags=False` instead of
delegating splitting and partitioning to `partition_text`. If
`xml_keep_tags=True`, the file is treated like a text file still and
partitioning is still delegated to `partition_text`.

Also adds the option to pass `text` as an input to `partition_xml`.

### Testing

Create a `parrots.xml` file that looks like:

```xml
<xml><parrot><name>Conure</name><description>A conure is a very friendly bird.

Conures are feathery and like to dance.</description></parrot></xml>
```

Run:

```python
from unstructured.partition.xml import partition_xml
from unstructured.staging.base import convert_to_dict

elements = partition_xml(filename="parrots.xml")
convert_to_dict(elements)
```

On `main`, the output is the following. Notice how the `<name>` tag
incorrectly gets merged into `<description>` in the first element.

```python
[{'element_id': '7ae4074435df8dfcefcf24a4e6c52026',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conure A conure is a very friendly bird.',
  'type': 'NarrativeText'},
 {'element_id': '859ecb332da6961acd2fb6a0185d1549',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conures are feathery and like to dance.',
  'type': 'NarrativeText'}]

```

On the feature branch, the output is the following, and the tags are
correctly separated.

```python
[{'element_id': '5512218914e4eeacf71a9cd42c373710',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conure',
  'type': 'Title'},
 {'element_id': '113bf8d250c2b1a77c9c2caa4b812f85',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'A conure is a very friendly bird.\n'
          '\n'
          'Conures are feathery and like to dance.',
  'type': 'NarrativeText'}]

```
											
										
										
											2023-08-30 17:07:10 -04:00
+								    assert "<leader>Joe Biden</leader>" in elements[0].text
 								    assert elements[0].metadata.filename == "factbook.xml"
-												feat: add `partition_xml` for XML files (#596)

* first pass on partition_xml

* add option to keep xml tags

* added tests for xml

* fix filename

* update filenames

* remove outdated readme

* add xml to auto

* version and changelog

* update readme and docs

* pass through include_metadata

* update include_metadata description

* add README back in

* linting, linting, linting

* more linting

* spooled to bytes doesn't need to be a tuple

* Add tests for newly supported filetypes

* Correct metadata filetype

* doc typo

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* keep_xml_tags -> xml_keep_tags

---------

Co-authored-by: Alan Bertl <alan@unstructured.io>
Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-05-18 11:40:12 -04:00
 								def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"):
 								    with open(filename, "rb") as f:
 								        elements = partition(file=f, xml_keep_tags=True)
-												feat: `partition_xml` infers element type on each leaf node (#1249)

### Summary

Closes #1229. Updates `partition_xml` so that the element type is
inferred on each leaf node when `xml_keep_tags=False` instead of
delegating splitting and partitioning to `partition_text`. If
`xml_keep_tags=True`, the file is treated like a text file still and
partitioning is still delegated to `partition_text`.

Also adds the option to pass `text` as an input to `partition_xml`.

### Testing

Create a `parrots.xml` file that looks like:

```xml
<xml><parrot><name>Conure</name><description>A conure is a very friendly bird.

Conures are feathery and like to dance.</description></parrot></xml>
```

Run:

```python
from unstructured.partition.xml import partition_xml
from unstructured.staging.base import convert_to_dict

elements = partition_xml(filename="parrots.xml")
convert_to_dict(elements)
```

On `main`, the output is the following. Notice how the `<name>` tag
incorrectly gets merged into `<description>` in the first element.

```python
[{'element_id': '7ae4074435df8dfcefcf24a4e6c52026',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conure A conure is a very friendly bird.',
  'type': 'NarrativeText'},
 {'element_id': '859ecb332da6961acd2fb6a0185d1549',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conures are feathery and like to dance.',
  'type': 'NarrativeText'}]

```

On the feature branch, the output is the following, and the tags are
correctly separated.

```python
[{'element_id': '5512218914e4eeacf71a9cd42c373710',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'Conure',
  'type': 'Title'},
 {'element_id': '113bf8d250c2b1a77c9c2caa4b812f85',
  'metadata': {'file_directory': '/home/matt/tmp',
               'filename': 'parrots.xml',
               'filetype': 'application/xml',
               'last_modified': '2023-08-30T14:21:38'},
  'text': 'A conure is a very friendly bird.\n'
          '\n'
          'Conures are feathery and like to dance.',
  'type': 'NarrativeText'}]

```
											
										
										
											2023-08-30 17:07:10 -04:00
+								    assert "<leader>Joe Biden</leader>" in elements[0].text
-												feat: add `partition_xml` for XML files (#596)

* first pass on partition_xml

* add option to keep xml tags

* added tests for xml

* fix filename

* update filenames

* remove outdated readme

* add xml to auto

* version and changelog

* update readme and docs

* pass through include_metadata

* update include_metadata description

* add README back in

* linting, linting, linting

* more linting

* spooled to bytes doesn't need to be a tuple

* Add tests for newly supported filetypes

* Correct metadata filetype

* doc typo

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* keep_xml_tags -> xml_keep_tags

---------

Co-authored-by: Alan Bertl <alan@unstructured.io>
Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-05-18 11:40:12 -04:00
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
+								EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
 								def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
-												enhancement: Add `include_header` kwarg for xlsx, default True(#1125)

Closes GitHub issue #1121

Adds include_header kwarg to partition_xlsx and changes default behavior to True.
											
										
										
											2023-08-16 23:16:23 -05:00
+								    elements = partition(filename=filename, include_header=False)
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
 								    assert all(isinstance(element, Table) for element in elements)
 								    assert len(elements) == 2
-												feat: partition_tsv for tab separated value files (#758)

* first pass at partition_tsv

* working tests

* create constants for tests and debug `make test` failure

* make check and tidy

* undo changes for testing locally

* update changelog and version

* fix bricks.rst

* refactor if statements

* make tidy

* fix README and change try/except to if/else

* update changelog and version

* fix\ docstring
											
										
										
											2023-06-15 13:50:53 -05:00
+								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
 								    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
+								    assert elements[0].metadata.page_number == 1
 								    assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
 								def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
 								    with open(filename, "rb") as f:
-												enhancement: Add `include_header` kwarg for xlsx, default True(#1125)

Closes GitHub issue #1121

Adds include_header kwarg to partition_xlsx and changes default behavior to True.
											
										
										
											2023-08-16 23:16:23 -05:00
+								        elements = partition(file=f, include_header=False)
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
 								    assert all(isinstance(element, Table) for element in elements)
 								    assert len(elements) == 2
-												feat: partition_tsv for tab separated value files (#758)

* first pass at partition_tsv

* working tests

* create constants for tests and debug `make test` failure

* make check and tidy

* undo changes for testing locally

* update changelog and version

* fix bricks.rst

* refactor if statements

* make tidy

* fix README and change try/except to if/else

* update changelog and version

* fix\ docstring
											
										
										
											2023-06-15 13:50:53 -05:00
+								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
 								    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
+								    assert elements[0].metadata.page_number == 1
 								    assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
-												build(release): bump unstructured-inference (#1074)

* build(release): bump unstructured-inference

Related to downstream issue:
Unstructured-IO/unstructured-api#182

And upstream PR:
Unstructured-IO/unstructured-inference#165

---------

Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
											
										
										
											2023-08-10 13:57:46 -07:00
+								EXPECTED_XLS_TEXT_LEN = 507
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
 								EXPECTED_XLS_INITIAL_45_CLEAN_TEXT = "MA What C datatypes are 8 bits? (assume i386)"
 								EXPECTED_XLS_TABLE = (
 								    """<table border="1" class="dataframe">
 								  <tbody>
 								    <tr>
 								      <td>MA</td>
 								      <td>What C datatypes are 8 bits? (assume i386)</td>
 								      <td>int</td>
 								      <td></td>
 								      <td>float</td>
 								      <td></td>
 								      <td>double</td>
 								      <td></td>
 								      <td>char</td>
 								    </tr>
 								    <tr>
 								      <td>TF</td>
 								      <td>Bagpipes are awesome.</td>
 								      <td>true</td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								    </tr>
 								    <tr>
 								      <td>ESS</td>
 								      <td>How have the original Henry Hornbostel buildings """
 								    """influenced campus architecture and design in the last 30 years?</td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								    </tr>
 								    <tr>
 								      <td>ORD</td>
 								      <td>Rank the following in their order of operation.</td>
 								      <td>Parentheses</td>
 								      <td>Exponents</td>
 								      <td>Division</td>
 								      <td>Addition</td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								    </tr>
 								    <tr>
 								      <td>FIB</td>
 								      <td>The student activities fee is</td>
 								      <td>95</td>
 								      <td>dollars for students enrolled in</td>
 								      <td>19</td>
 								      <td>units or more,</td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								    </tr>
 								    <tr>
 								      <td>MAT</td>
 								      <td>Match the lower-case greek letter with its capital form.</td>
 								      <td>λ</td>
 								      <td>Λ</td>
 								      <td>α</td>
 								      <td>γ</td>
 								      <td>Γ</td>
 								      <td>φ</td>
 								      <td>Φ</td>
 								    </tr>
 								  </tbody>
 								</table>"""
 								)
-												fix: extract emojis with `partition_xlsx` (#1009)

* 🐛 fixed emoji xlsx bug

* update version and changelog

* check if beautifulsoup exists

* update docs

* fix html parser call

* fix failing attachment test

* ✅  added emoji test, added requirement, fixed dependency

* 🐛 dependency

* 🐛 correct dependency

* linting, linting, linting

* check for bs4

* skip auto xls filename test

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
											
										
										
											2023-08-04 16:14:08 +02:00
+								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
+								def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"):
-												enhancement: Add `include_header` kwarg for xlsx, default True(#1125)

Closes GitHub issue #1121

Adds include_header kwarg to partition_xlsx and changes default behavior to True.
											
										
										
											2023-08-16 23:16:23 -05:00
+								    elements = partition(filename=filename, include_header=False)
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
 								    assert all(isinstance(element, Table) for element in elements)
 								    assert len(elements) == 3
 								    assert clean_extra_whitespace(elements[0].text)[:45] == EXPECTED_XLS_INITIAL_45_CLEAN_TEXT
-												build(release): bump unstructured-inference (#1074)

* build(release): bump unstructured-inference

Related to downstream issue:
Unstructured-IO/unstructured-api#182

And upstream PR:
Unstructured-IO/unstructured-inference#165

---------

Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
											
										
										
											2023-08-10 13:57:46 -07:00
+								    # NOTE(crag): if the beautifulsoup4 package is installed, some (but not all) additional
 								    # whitespace is removed, so the expected text length is less than is the case
 								    # when beautifulsoup4 is *not* installed. E.g.
 								    # "\n\n\nMA\nWhat C datatypes are 8 bits" vs.
 								    # '\n  \n    \n      MA\n      What C datatypes are 8 bits?... "
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
+								    assert len(elements[0].text) == EXPECTED_XLS_TEXT_LEN
 								    assert elements[0].metadata.text_as_html == EXPECTED_XLS_TABLE
-												fix: add more mime types for csv (#620)


											
										
										
											2023-05-19 17:40:26 -04:00
+								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
+								def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
 								    elements = partition(filename=filename)
-												feat: partition_tsv for tab separated value files (#758)

* first pass at partition_tsv

* working tests

* create constants for tests and debug `make test` failure

* make check and tidy

* undo changes for testing locally

* update changelog and version

* fix bricks.rst

* refactor if statements

* make tidy

* fix README and change try/except to if/else

* update changelog and version

* fix\ docstring
											
										
										
											2023-06-15 13:50:53 -05:00
+								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
 								    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
+								    assert elements[0].metadata.filetype == "text/csv"
-												Chore: Pass table support  param to partition image (#973)

* add param and test in image table extraction

* version and changelog

* need to publish this one for api repo

* add new param skip_infer_table_types

* use warning

* clean up with mapping

* add test for tsv

* fix test fail

* weird change from merge

* doc nit

* don't use mapping

* correct conflict
											
										
										
											2023-07-27 13:33:36 -04:00
+								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
 								def test_auto_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"):
 								    elements = partition(filename=filename)
 								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
 								    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
 								    assert elements[0].metadata.filetype == "text/tsv"
-												fix: add more mime types for csv (#620)


											
										
										
											2023-05-19 17:40:26 -04:00
+								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
+								def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
 								    with open(filename, "rb") as f:
 								        elements = partition(file=f)
-												feat: partition_tsv for tab separated value files (#758)

* first pass at partition_tsv

* working tests

* create constants for tests and debug `make test` failure

* make check and tidy

* undo changes for testing locally

* update changelog and version

* fix bricks.rst

* refactor if statements

* make tidy

* fix README and change try/except to if/else

* update changelog and version

* fix\ docstring
											
										
										
											2023-06-15 13:50:53 -05:00
+								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
+								    assert isinstance(elements[0], Table)
-												feat: partition_tsv for tab separated value files (#758)

* first pass at partition_tsv

* working tests

* create constants for tests and debug `make test` failure

* make check and tidy

* undo changes for testing locally

* update changelog and version

* fix bricks.rst

* refactor if statements

* make tidy

* fix README and change try/except to if/else

* update changelog and version

* fix\ docstring
											
										
										
											2023-06-15 13:50:53 -05:00
+								    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
+								    assert elements[0].metadata.filetype == "text/csv"
-												enhancement: handling for empty files in `detect_filetype` and `partition` (#710)

* add empty filetype

* add empty handling to partition

* changelog and version
											
										
										
											2023-06-09 16:07:50 -04:00
-												feature(html partition): parse pre tag (#642)

* feature(html partition): parse pre tag

* chore: update CHANGELOG.md

* style: black format xml.py

* Added tests for html with pre tag

* remove skip test, update parse pre tag

* fix style

* chore: spell check

* chore: update changelog & version

* chore: update ingest test fixtures

* chore: add exception handling if `element.text` is `None` in `_read_xml`

* test: add more sanity testing on the `.text` content of the element(s)

* refactor: move the conditional logic for <pre> outside of the `try/except` block

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
											
										
										
											2023-06-27 21:52:39 +03:00
+								def test_auto_partition_html_pre_from_file(filename="example-docs/fake-html-pre.htm"):
 								    elements = partition(filename=filename)
 								    assert len(elements) > 0
-												Avoid setting metadata in constructor signature for elements (#837)

Avoid setting metadata in constructor signature for elements because that can lead to unexpected object reuse (and modification).

Bonus refactor for PageBreak to have text values of "".

---------

Co-authored-by: Alan Bertl <alan@unstructured.io>
Co-authored-by: Crag Wolfe <crag@unstructuredai.io>
											
										
										
											2023-06-28 23:14:05 -04:00
+								    assert "PageBreak" not in [elem.category for elem in elements]
-												fix: respect `<pre>` tag order in `partition_html` (#1197)

### Summary

Closes #1184. Updates `partition_html` to respect the ordering of
`<pre>` tags in HTML documents.

### Testing

The elements in the following example should be in the correct order.

```python
    from unstructured.partition.html import partition_html

    html_text = """
    <pre>The Big Brown Bear</pre>
    <div>The big brown bear is growling.</div>
    <pre>The big brown bear is sleeping.</pre>
    <div>The Big Blue Bear</div>
    """
    elements = partition_html(text=html_text)
    print("\n\n".join([str(el) for el in elements]))
```
											
										
										
											2023-08-25 00:14:48 -04:00
+								    assert clean_extra_whitespace(elements[0].text).startswith("[107th Congress Public Law 56]")
 								    assert isinstance(elements[0], NarrativeText)
-												feature(html partition): parse pre tag (#642)

* feature(html partition): parse pre tag

* chore: update CHANGELOG.md

* style: black format xml.py

* Added tests for html with pre tag

* remove skip test, update parse pre tag

* fix style

* chore: spell check

* chore: update changelog & version

* chore: update ingest test fixtures

* chore: add exception handling if `element.text` is `None` in `_read_xml`

* test: add more sanity testing on the `.text` content of the element(s)

* refactor: move the conditional logic for <pre> outside of the `try/except` block

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
											
										
										
											2023-06-27 21:52:39 +03:00
+								    assert elements[0].metadata.filetype == "text/html"
 								    assert elements[0].metadata.filename == "fake-html-pre.htm"
-												enhancement: handling for empty files in `detect_filetype` and `partition` (#710)

* add empty filetype

* add empty handling to partition

* changelog and version
											
										
										
											2023-06-09 16:07:50 -04:00
+								def test_auto_partition_works_on_empty_filename(filename="example-docs/empty.txt"):
 								    assert partition(filename=filename) == []
 								def test_auto_partition_works_on_empty_file(filename="example-docs/empty.txt"):
 								    """Partitioning the `empty.txt` fixture from a binary handle must return an empty list."""
 								    with open(filename, "rb") as empty_file:
 								        elements = partition(file=empty_file)
 								    assert elements == []
-												feat: `partition_rst` for ReStructured Text documents (#725)

* add example rst file

* filetype detection for rst files

* add partition_rst function

* add partition_rst to auto

* update readme

* update docs

* changelog and version

* pandocs -> pandoc

* fix typo
											
										
										
											2023-06-12 15:31:10 -04:00
-												feat: partition_org for Org Mode documents (#780)

* feat: partition_org for Org Mode documents

* update version
											
										
										
											2023-06-23 20:45:31 +02:00
+								def test_auto_partition_org_from_filename(filename="example-docs/README.org"):
 								    elements = partition(filename=filename)
 								    assert elements[0] == Title("Example Docs")
 								    assert elements[0].metadata.filetype == "text/org"
 								def test_auto_partition_org_from_file(filename="example-docs/README.org"):
 								    """Partition an Org Mode file from a binary handle with an explicit content type."""
 								    with open(filename, "rb") as org_file:
 								        # The content type is passed explicitly alongside the file handle.
 								        first = partition(file=org_file, content_type="text/org")[0]
 								    assert first == Title("Example Docs")
 								    assert first.metadata.filetype == "text/org"
-												feat: `partition_rst` for ReStructured Text documents (#725)

* add example rst file

* filetype detection for rst files

* add partition_rst function

* add partition_rst to auto

* update readme

* update docs

* changelog and version

* pandocs -> pandoc

* fix typo
											
										
										
											2023-06-12 15:31:10 -04:00
+								def test_auto_partition_rst_from_filename(filename="example-docs/README.rst"):
 								    elements = partition(filename=filename)
 								    assert elements[0] == Title("Example Docs")
 								    assert elements[0].metadata.filetype == "text/x-rst"
 								def test_auto_partition_rst_from_file(filename="example-docs/README.rst"):
 								    """Partition a reStructuredText file from a binary handle with an explicit content type."""
 								    with open(filename, "rb") as rst_file:
 								        # The content type is passed explicitly alongside the file handle.
 								        first = partition(file=rst_file, content_type="text/x-rst")[0]
 								    assert first == Title("Example Docs")
 								    assert first.metadata.filetype == "text/x-rst"
-												fixed filename metadata bug when using file and file_filename (#1002)


											
										
										
											2023-08-02 18:14:15 -07:00
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								def test_auto_partition_metadata_filename():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
 								    with open(filename) as f:
 								        elements = partition(file=f, metadata_filename=filename)
 								    assert elements[0].metadata.filename == os.path.split(filename)[-1]
 								def test_auto_partition_warns_about_file_filename_deprecation(caplog):
-												fixed filename metadata bug when using file and file_filename (#1002)


											
										
										
											2023-08-02 18:14:15 -07:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
 								    with open(filename) as f:
 								        elements = partition(file=f, file_filename=filename)
 								    assert elements[0].metadata.filename == os.path.split(filename)[-1]
-												chore: deprecation warning for `file_filename` (#1191)

### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Should not emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should emit a warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should be test.epub
elements[0].metadata.filename

# Should raise an error
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
											
										
										
											2023-08-24 03:02:47 -04:00
+								    assert "WARNING" in caplog.text
 								    assert "The file_filename kwarg will be deprecated" in caplog.text
 								def test_auto_partition_raises_with_file_and_metadata_filename():
 								    """Passing both `file_filename` and `metadata_filename` must raise `ValueError`."""
 								    path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
 								    with open(path) as text_file:
 								        with pytest.raises(ValueError):
 								            partition(file=text_file, file_filename=path, metadata_filename=path)
-												enhancement: tell users to install missing extras (#1167)

### Summary

Updates `partition` to let users know to install the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating `partition_pdf` (or whichever function that requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
											
										
										
											2023-08-21 23:00:21 -04:00
 								def test_get_partition_with_extras_prompts_for_install_if_missing():
 								    """An empty extras map must raise `ImportError` carrying the pip install hint for pdf."""
 								    expected_hint = 'Install the pdf dependencies with pip install "unstructured[pdf]"'
 								    # An empty mapping means no partitioner is registered for "pdf".
 								    with pytest.raises(ImportError) as raised:
 								        _get_partition_with_extras("pdf", {})
 								    assert expected_hint in str(raised.value)
-												chunk_by_title decorator (#1304)

### Summary

Partial solution to #1185.
Related to #1222.
Creates decorator from `chunk_by_title` cleaning brick.
Breaks a document into sections based on the presence of Title elements.
Also starts a new section under the following conditions:

- If metadata changes, indicating a change in section or page or a
switch to processing attachments. If `multipage_sections=True`, sections
can span pages. `multipage_sections` defaults to True.
- If the length of the section exceeds `new_after_n_chars` characters.
The default is 1500. The **chunking function does not split individual
elements**, so it's possible for a section to exceed that threshold if
an individual element is over `new_after_n_chars` characters, which
could occur with a long NarrativeText element.

Combines sections under these conditions
- Sections under `combine_under_n_chars` characters are combined. The
default is 500.

### Testing

from unstructured.partition.html import partition_html

url = "https://understandingwar.org/backgrounder/russian-offensive-campaign-assessment-august-27-2023-0"
chunks = partition_html(url=url, chunking_strategy="by_title")

for chunk in chunks:
    print(chunk)
    print("\n\n" + "-"*80)
    input()

											
										
										
											2023-09-11 16:00:14 -05:00
 								def test_add_chunking_strategy_on_partition_auto():
 								    """`partition(..., chunking_strategy="by_title")` must match `chunk_by_title` on its output."""
 								    html_path = "example-docs/example-10k-1p.html"
 								    chunked_by_partition = partition(html_path, chunking_strategy="by_title")
 								    unchunked = partition(html_path)
 								    expected_chunks = chunk_by_title(unchunked)
 								    # Chunking must change the result, and must do so exactly as chunk_by_title does.
 								    assert chunked_by_partition != unchunked
 								    assert chunked_by_partition == expected_chunks
 								def test_add_chunking_strategy_on_partition_auto_respects_multipage():
 								    """Check that `partition` honours `multipage_sections` when chunking by title."""
 								    filename = "example-docs/example-10k-1p.html"
 								    # Chunk through `partition` itself, once per `multipage_sections` setting.
 								    partitioned_elements_multipage_false_combine_chars_0 = partition(
 								        filename,
 								        chunking_strategy="by_title",
 								        multipage_sections=False,
 								        combine_under_n_chars=0,
 								    )
 								    partitioned_elements_multipage_true_combine_chars_0 = partition(
 								        filename,
 								        chunking_strategy="by_title",
 								        multipage_sections=True,
 								        combine_under_n_chars=0,
 								    )
 								    # Reference results: partition without chunking, then call `chunk_by_title` directly
 								    # with the same options.
 								    elements = partition(filename)
 								    cleaned_elements_multipage_false_combine_chars_0 = chunk_by_title(
 								        elements,
 								        multipage_sections=False,
 								        combine_under_n_chars=0,
 								    )
 								    cleaned_elements_multipage_true_combine_chars_0 = chunk_by_title(
 								        elements,
 								        multipage_sections=True,
 								        combine_under_n_chars=0,
 								    )
 								    # Both routes must agree for each `multipage_sections` value.
 								    assert (
 								        partitioned_elements_multipage_false_combine_chars_0
 								        == cleaned_elements_multipage_false_combine_chars_0
 								    )
 								    assert (
 								        partitioned_elements_multipage_true_combine_chars_0
 								        == cleaned_elements_multipage_true_combine_chars_0
 								    )
 								    # The two settings are expected to yield a different number of chunks for this document.
 								    assert len(partitioned_elements_multipage_true_combine_chars_0) != len(
 								        partitioned_elements_multipage_false_combine_chars_0,
 								    )