unstructured/test_unstructured/partition/test_auto.py

import json
import os
import pathlib
import warnings
from importlib import import_module
from unittest.mock import patch

import docx
import pytest

from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import (
    Address,
    ElementMetadata,
    ListItem,
    NarrativeText,
    PageBreak,
    Table,
    Text,
    Title,
)
from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, FileType
from unstructured.partition import auto
from unstructured.partition.auto import partition
from unstructured.partition.common import convert_office_doc
from unstructured.staging.base import elements_to_json

# Resolved path of the directory that contains this test module.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
# Path to the "example-docs" folder, reached by going two levels up from this module.
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")

# Elements the email tests below compare against after partitioning "fake-email.eml".
EXPECTED_EMAIL_OUTPUT = [
    NarrativeText(text="This is a test email to use for unit tests."),
    Title(text="Important points:"),
    ListItem(text="Roses are red"),
    ListItem(text="Violets are blue"),
]

# True when "/.dockerenv" exists; used by skipif markers further down in this module.
is_in_docker = os.path.exists("/.dockerenv")


def test_auto_partition_email_from_filename():
    """Partition an .eml file by path and check its elements and path metadata."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    directory, basename = os.path.split(eml_path)

    elements = partition(filename=eml_path, strategy="hi_res")

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
    first_metadata = elements[0].metadata
    assert first_metadata.filename == basename
    assert first_metadata.file_directory == directory


def test_auto_partition_email_from_file():
    """Partition an .eml file handed over as a text-mode file object."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with open(eml_path) as eml_file:
        parsed = partition(file=eml_file, strategy="hi_res")

    assert len(parsed) > 0
    assert parsed == EXPECTED_EMAIL_OUTPUT


def test_auto_partition_email_from_file_rb():
    """Partition an .eml file handed over as a binary-mode file object."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with open(eml_path, "rb") as eml_file:
        parsed = partition(file=eml_file, strategy="hi_res")

    assert len(parsed) > 0
    assert parsed == EXPECTED_EMAIL_OUTPUT


@pytest.fixture()
def mock_docx_document():
    """Return a small docx document covering several paragraph styles."""
    styled_paragraphs = [
        ("These are a few of my favorite things:", "Heading 1"),
        # NOTE(robinson) - this should get picked up as a list item due to the •
        ("• Parrots", "Normal"),
        ("Hockey", "List Bullet"),
        # NOTE(robinson) - this should get picked up as a title
        ("Analysis", "Normal"),
        # NOTE(robinson) - this should get dropped because it is empty
        ("", "Normal"),
        # NOTE(robinson) - this should get picked up as a narrative text
        ("This is my first thought. This is my second thought.", "Normal"),
        ("This is my third thought.", "Body Text"),
    ]

    doc = docx.Document()
    for paragraph_text, style_name in styled_paragraphs:
        doc.add_paragraph(paragraph_text, style=style_name)
    # NOTE(robinson) - this should just be regular text (added without an explicit style)
    doc.add_paragraph("2023")

    return doc


@pytest.fixture()
def expected_docx_elements():
    """Return the elements the docx/doc tests expect from the mock document."""
    title_and_list = [
        Title("These are a few of my favorite things:"),
        ListItem("Parrots"),
        ListItem("Hockey"),
    ]
    analysis_section = [
        Title("Analysis"),
        NarrativeText("This is my first thought. This is my second thought."),
        NarrativeText("This is my third thought."),
        Text("2023"),
    ]
    return title_and_list + analysis_section


def test_auto_partition_docx_with_filename(mock_docx_document, expected_docx_elements, tmpdir):
    """Save the mock document as .docx and partition it by path."""
    docx_path = os.path.join(tmpdir.dirname, "mock_document.docx")
    mock_docx_document.save(docx_path)

    parsed = partition(filename=docx_path, strategy="hi_res")

    assert parsed == expected_docx_elements
    assert parsed[0].metadata.filename == os.path.split(docx_path)[1]


def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_elements, tmpdir):
    """Save the mock document as .docx and partition it from a binary file object."""
    docx_path = os.path.join(tmpdir.dirname, "mock_document.docx")
    mock_docx_document.save(docx_path)

    with open(docx_path, "rb") as docx_file:
        parsed = partition(file=docx_file, strategy="hi_res")

    assert parsed == expected_docx_elements


@pytest.mark.parametrize(
    ("pass_file_filename", "content_type"),
    [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
)
def test_auto_partition_doc_with_filename(
    mock_docx_document,
    expected_docx_elements,
    tmpdir,
    pass_file_filename,
    content_type,
):
    """Convert the mock .docx to .doc and partition the .doc by path.

    Parametrized over whether ``file_filename`` and ``content_type`` are supplied.
    """
    work_dir = tmpdir.dirname
    docx_path = os.path.join(work_dir, "mock_document.docx")
    doc_path = os.path.join(work_dir, "mock_document.doc")

    mock_docx_document.save(docx_path)
    convert_office_doc(docx_path, work_dir, "doc")

    partition_kwargs = {
        "filename": doc_path,
        "file_filename": doc_path if pass_file_filename else None,
        "content_type": content_type,
        "strategy": "hi_res",
    }
    parsed = partition(**partition_kwargs)

    assert parsed == expected_docx_elements
    first_metadata = parsed[0].metadata
    assert first_metadata.filename == "mock_document.doc"
    assert first_metadata.file_directory == work_dir


# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
# determine that the file is an .doc document
@pytest.mark.xfail()
def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements, tmpdir):
    """Convert the mock .docx to .doc and partition the .doc from a file object."""
    work_dir = tmpdir.dirname
    docx_path = os.path.join(work_dir, "mock_document.docx")
    doc_path = os.path.join(work_dir, "mock_document.doc")

    mock_docx_document.save(docx_path)
    convert_office_doc(docx_path, work_dir, "doc")

    with open(doc_path, "rb") as doc_file:
        parsed = partition(file=doc_file, strategy="hi_res")

    assert parsed == expected_docx_elements


@pytest.mark.parametrize(
    ("pass_file_filename", "content_type"),
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_filename(pass_file_filename, content_type):
    """Partition an HTML document by path and check the path metadata."""
    html_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
    directory, basename = os.path.split(html_path)

    partition_kwargs = {
        "filename": html_path,
        "file_filename": html_path if pass_file_filename else None,
        "content_type": content_type,
        "strategy": "hi_res",
    }
    parsed = partition(**partition_kwargs)

    assert len(parsed) > 0
    first_metadata = parsed[0].metadata
    assert first_metadata.filename == basename
    assert first_metadata.file_directory == directory


@pytest.mark.parametrize(
    ("pass_file_filename", "content_type"),
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_file(pass_file_filename, content_type):
    """Partition an HTML document passed as a text-mode file object."""
    html_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
    optional_name = html_path if pass_file_filename else None

    with open(html_path) as html_file:
        parsed = partition(
            file=html_file,
            file_filename=optional_name,
            content_type=content_type,
            strategy="hi_res",
        )

    assert len(parsed) > 0


def test_auto_partition_html_from_file_rb():
    """Partition an HTML document passed as a binary-mode file object."""
    html_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
    with open(html_path, "rb") as html_file:
        parsed = partition(file=html_file, strategy="hi_res")

    assert len(parsed) > 0


def test_auto_partition_json_from_filename():
    """Test auto-processing an unstructured json output file by filename."""
    relative_parts = (
        "..",
        "test_unstructured_ingest",
        "expected-structured-output",
        "azure-blob-storage",
        "spring-weather.html.json",
    )
    json_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, *relative_parts)

    with open(json_path) as json_f:
        expected = json.load(json_f)

    partitioned = partition(filename=json_path, strategy="hi_res")
    actual = json.loads(elements_to_json(partitioned))

    for item in actual:
        # coordinates are always in the element data structures, even if None
        item.pop("coordinates")
        item.pop("metadata")
    for item in expected:
        item.pop("metadata")

    assert expected == actual


@pytest.mark.xfail(
    reason="parsed as text not json, https://github.com/Unstructured-IO/unstructured/issues/492",
)
def test_auto_partition_json_from_file():
    """Test auto-processing an unstructured json output file by file handle."""
    relative_parts = (
        "..",
        "test_unstructured_ingest",
        "expected-structured-output",
        "azure-blob-storage",
        "spring-weather.html.json",
    )
    json_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, *relative_parts)

    with open(json_path) as json_f:
        expected = json.load(json_f)

    with open(json_path, encoding="utf-8") as partition_f:
        partitioned = partition(file=partition_f, strategy="hi_res")
        actual = json.loads(elements_to_json(partitioned))

    for item in actual:
        # coordinates are always in the element data structures, even if None
        item.pop("coordinates")

    assert expected == actual


# Elements the plain-text tests below compare against after partitioning "fake-text.txt".
EXPECTED_TEXT_OUTPUT = [
    NarrativeText(text="This is a test document to use for unit tests."),
    Address(text="Doylestown, PA 18901"),
    Title(text="Important points:"),
    ListItem(text="Hamburgers are delicious"),
    ListItem(text="Dogs are the best"),
    ListItem(text="I love fuzzy blankets"),
]


def test_auto_partition_text_from_filename():
    """Partition a .txt file by path and check its elements and path metadata."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    directory, basename = os.path.split(txt_path)

    parsed = partition(filename=txt_path, strategy="hi_res")

    assert len(parsed) > 0
    assert parsed == EXPECTED_TEXT_OUTPUT
    first_metadata = parsed[0].metadata
    assert first_metadata.filename == basename
    assert first_metadata.file_directory == directory


def test_auto_partition_text_from_file():
    """Partition a .txt file passed as a text-mode file object."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(txt_path) as txt_file:
        parsed = partition(file=txt_file, strategy="hi_res")

    assert len(parsed) > 0
    assert parsed == EXPECTED_TEXT_OUTPUT


@pytest.mark.parametrize(
    ("pass_file_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_filename(pass_file_filename, content_type, request):
    """Partition a PDF by path; the checks after the xfail marker are expected to fail."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    directory, basename = os.path.split(pdf_path)

    partition_kwargs = {
        "filename": pdf_path,
        "file_filename": pdf_path if pass_file_filename else None,
        "content_type": content_type,
        "strategy": "hi_res",
    }
    parsed = partition(**partition_kwargs)

    title = parsed[0]
    assert isinstance(title, Title)
    assert title.text.startswith("LayoutParser")

    assert title.metadata.filename == basename
    assert title.metadata.file_directory == directory

    # NOTE(alan): Xfail since new model skips the word Zejiang
    request.applymarker(pytest.mark.xfail)

    authors = parsed[1]
    assert isinstance(authors, NarrativeText)
    assert authors.text.startswith("Zejiang Shen")


def test_auto_partition_pdf_uses_table_extraction():
    """Check that ``pdf_infer_table_structure=True`` reaches the model call as ``extract_tables``."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    patch_target = "unstructured_inference.inference.layout.process_file_with_model"

    with patch(patch_target) as mock_process_file_with_model:
        partition(pdf_path, pdf_infer_table_structure=True, strategy="hi_res")
        # call_args is an (args, kwargs) pair for the most recent call.
        _, call_kwargs = mock_process_file_with_model.call_args
        assert call_kwargs["extract_tables"]


def test_auto_partition_pdf_with_fast_strategy():
    """Check the exact keyword arguments forwarded to ``partition_pdf`` for the fast strategy."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    expected_kwargs = {
        "filename": pdf_path,
        "file": None,
        "url": None,
        "include_page_breaks": False,
        "infer_table_structure": False,
        "strategy": "fast",
        "ocr_languages": "eng",
    }

    stub_elements = [NarrativeText("Hello there!")]
    with patch.object(auto, "partition_pdf", return_value=stub_elements) as mock_partition:
        partition(filename=pdf_path, strategy="fast")

    mock_partition.assert_called_once_with(**expected_kwargs)


@pytest.mark.parametrize(
    ("pass_file_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_file(pass_file_filename, content_type, request):
    """Partition a PDF from a file object; checks after the xfail marker are expected to fail."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    optional_name = pdf_path if pass_file_filename else None

    with open(pdf_path, "rb") as pdf_file:
        parsed = partition(
            file=pdf_file,
            file_filename=optional_name,
            content_type=content_type,
            strategy="hi_res",
        )

    title = parsed[0]
    assert isinstance(title, Title)
    assert title.text.startswith("LayoutParser")

    # NOTE(alan): Xfail since new model misses the first word Zejiang
    request.applymarker(pytest.mark.xfail)

    authors = parsed[1]
    assert isinstance(authors, NarrativeText)
    assert authors.text.startswith("Zejiang Shen")


def test_partition_pdf_doesnt_raise_warning():
    """Partition a PDF with warnings escalated to errors, so any warning fails the test."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    # NOTE(robinson): This is the recommended way to check that no warning is emitted,
    # per the pytest docs.
    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
    #      #additional-use-cases-of-warnings-in-tests
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        partition(strategy="hi_res", filename=pdf_path)


@pytest.mark.parametrize(
    ("pass_file_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpg(pass_file_filename, content_type):
    """Partition a JPEG by path and check that some elements come back."""
    jpg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
    partition_kwargs = {
        "filename": jpg_path,
        "file_filename": jpg_path if pass_file_filename else None,
        "content_type": content_type,
        "strategy": "auto",
    }

    parsed = partition(**partition_kwargs)

    assert len(parsed) > 0


@pytest.mark.parametrize(
    ("pass_file_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpg_from_file(pass_file_filename, content_type):
    """Partition a JPEG from a binary file object and check that some elements come back."""
    jpg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
    optional_name = jpg_path if pass_file_filename else None

    with open(jpg_path, "rb") as jpg_file:
        parsed = partition(
            file=jpg_file,
            file_filename=optional_name,
            content_type=content_type,
            strategy="auto",
        )

    assert len(parsed) > 0


def test_auto_partition_raises_with_bad_type(monkeypatch):
    """With ``detect_filetype`` stubbed to return None, ``partition`` must raise ValueError."""

    def _detect_nothing(*args, **kwargs):
        return None

    monkeypatch.setattr(auto, "detect_filetype", _detect_nothing)
    with pytest.raises(ValueError):
        partition(filename="made-up.fake", strategy="hi_res")


# Elements both the .pptx and .ppt tests below compare against.
EXPECTED_PPTX_OUTPUT = [
    Title(text="Adding a Bullet Slide"),
    ListItem(text="Find the bullet slide layout"),
    ListItem(text="Use _TextFrame.text for first bullet"),
    ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
    NarrativeText(text="Here is a lot of text!"),
    NarrativeText(text="Here is some text in a text box!"),
]


def test_auto_partition_pptx_from_filename():
    """Partition a .pptx file by path and check its elements and path metadata."""
    pptx_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
    directory, basename = os.path.split(pptx_path)

    parsed = partition(filename=pptx_path, strategy="hi_res")

    assert parsed == EXPECTED_PPTX_OUTPUT
    first_metadata = parsed[0].metadata
    assert first_metadata.filename == basename
    assert first_metadata.file_directory == directory


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_ppt_from_filename():
    """Partition a legacy .ppt file by path and check its elements and path metadata."""
    ppt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
    directory, basename = os.path.split(ppt_path)

    parsed = partition(filename=ppt_path, strategy="hi_res")

    assert parsed == EXPECTED_PPTX_OUTPUT
    first_metadata = parsed[0].metadata
    assert first_metadata.filename == basename
    assert first_metadata.file_directory == directory


def test_auto_with_page_breaks():
    """With ``include_page_breaks=True`` the output must contain a ``PageBreak`` element."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    parsed = partition(strategy="hi_res", include_page_breaks=True, filename=pdf_path)
    assert PageBreak() in parsed


def test_auto_partition_epub_from_filename():
    """Partition an .epub file by path and check how the first element's text begins."""
    # Same location as os.path.join(DIRECTORY, "..", "..", "example-docs", ...).
    epub_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "winter-sports.epub")

    parsed = partition(filename=epub_path, strategy="hi_res")

    assert len(parsed) > 0
    opening_text = parsed[0].text
    assert opening_text.startswith("The Project Gutenberg eBook of Winter Sports")


def test_auto_partition_epub_from_file():
    """Partition an .epub from a binary file object and check how the first element begins."""
    # Same location as os.path.join(DIRECTORY, "..", "..", "example-docs", ...).
    epub_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "winter-sports.epub")

    with open(epub_path, "rb") as epub_file:
        parsed = partition(file=epub_file, strategy="hi_res")

    assert len(parsed) > 0
    opening_text = parsed[0].text
    assert opening_text.startswith("The Project Gutenberg eBook of Winter Sports")


# Elements the .msg test below compares against after partitioning "fake-email.msg".
EXPECTED_MSG_OUTPUT = [
    NarrativeText(text="This is a test email to use for unit tests."),
    Title(text="Important points:"),
    ListItem(text="Roses are red"),
    ListItem(text="Violets are blue"),
]


def test_auto_partition_msg_from_filename():
    """Partition a .msg file by path and compare against the expected elements."""
    msg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
    parsed = partition(strategy="hi_res", filename=msg_path)
    assert parsed == EXPECTED_MSG_OUTPUT


def test_auto_partition_rtf_from_filename():
    """Partition an .rtf file by path and check that the first element is the heading."""
    rtf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf")
    first_element = partition(filename=rtf_path, strategy="hi_res")[0]
    assert first_element == Title("My First Heading")


def test_auto_partition_from_url():
    """Partition a document fetched by URL and check that the URL is stored in metadata."""
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
    parsed = partition(url=license_url, content_type="text/plain", strategy="hi_res")

    first_element = parsed[0]
    assert first_element == Title("Apache License")
    assert first_element.metadata.url == license_url


def test_partition_md_works_with_embedded_html():
    """Partition the project's README.md by URL and check that its text is extracted.

    The test passes when at least one returned element mentions "unstructured".
    """
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"
    elements = partition(url=url, content_type="text/markdown", strategy="hi_res")
    # Inspect every element: the earlier loop re-read elements[0] on each iteration,
    # so only the first element was ever examined.
    assert any("unstructured" in element.text for element in elements)


def test_auto_partition_warns_if_header_set_and_not_url(caplog):
    """Passing ``headers`` together with a filename should log a WARNING record first."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    request_headers = {"Accept": "application/pdf"}

    partition(filename=eml_path, headers=request_headers, strategy="hi_res")

    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"


def test_auto_partition_works_with_unstructured_jsons():
    """Partition an unstructured JSON output file by path and check the first element's text."""
    json_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
    first_element = partition(filename=json_path, strategy="hi_res")[0]
    assert first_element.text == "News Around NOAA"


def test_auto_partition_works_with_unstructured_jsons_from_file():
    """Partition an unstructured JSON output file from a binary file object."""
    json_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
    with open(json_path, "rb") as json_file:
        parsed = partition(file=json_file, strategy="hi_res")

    first_element = parsed[0]
    assert first_element.text == "News Around NOAA"


def test_auto_partition_odt_from_filename():
    """Partition an .odt file by path; it should yield exactly one Title element."""
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    expected = [Title("Lorem ipsum dolor sit amet.")]

    parsed = partition(filename=odt_path, strategy="hi_res")

    assert parsed == expected


def test_auto_partition_odt_from_file():
    """Partition an .odt from a binary file object; it should yield exactly one Title element."""
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    expected = [Title("Lorem ipsum dolor sit amet.")]

    with open(odt_path, "rb") as odt_file:
        parsed = partition(file=odt_file, strategy="hi_res")

    assert parsed == expected


@pytest.mark.parametrize(
    ("content_type", "routing_func", "expected"),
    [
        ("application/json", "json", "application/json"),
        ("text/html", "html", "text/html"),
        ("jdsfjdfsjkds", "pdf", None),
    ],
)
def test_auto_adds_filetype_to_metadata(content_type, routing_func, expected):
    """Check the ``filetype`` metadata on elements returned by a stubbed partition function."""

    def _stub_partition(*args, **kwargs):
        return [Text("text 1"), Text("text 2")]

    patch_target = f"unstructured.partition.auto.partition_{routing_func}"
    with patch(patch_target, _stub_partition):
        parsed = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type)

    assert len(parsed) == 2
    assert all(element.metadata.filetype == expected for element in parsed)


@pytest.mark.parametrize(
    ("content_type", "expected"),
    [
        ("application/pdf", FILETYPE_TO_MIMETYPE[FileType.PDF]),
        (None, FILETYPE_TO_MIMETYPE[FileType.PDF]),
    ],
)
def test_auto_filetype_overrides_file_specific(content_type, expected):
    """A filetype set by the stubbed ``partition_pdf`` must be replaced by the PDF mimetype."""
    pdf_metadata = ElementMetadata(filetype="imapdf")

    def _stub_partition_pdf(*args, **kwargs):
        # Both elements share the same metadata object, as in the lambda-based stub.
        return [
            Text("text 1", metadata=pdf_metadata),
            Text("text 2", metadata=pdf_metadata),
        ]

    with patch("unstructured.partition.auto.partition_pdf", _stub_partition_pdf):
        parsed = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type)

    assert len(parsed) == 2
    assert all(element.metadata.filetype == expected for element in parsed)


# Every FileType member except UNK, ZIP and XLS, in enum iteration order.
supported_filetypes = [
    filetype
    for filetype in FileType
    if filetype not in (FileType.UNK, FileType.ZIP, FileType.XLS)
]


# File types whose partition module name is not simply the lowercased enum member name.
FILETYPE_TO_MODULE = {
    FileType.JPG: "image",
    FileType.PNG: "image",
    FileType.TXT: "text",
    FileType.EML: "email",
}


@pytest.mark.parametrize("filetype", supported_filetypes)
def test_file_specific_produces_correct_filetype(filetype: FileType):
    """Check that each file-specific partition function reports its own mimetype.

    For the given file type, the matching ``partition_<name>`` function is imported from
    ``unstructured.partition.<name>`` and run on the first file in ``example-docs`` with a
    matching extension. Elements whose ``filetype`` metadata is None are ignored.
    JPG, PNG and EMPTY are skipped.
    """
    if filetype in (FileType.JPG, FileType.PNG, FileType.EMPTY):
        pytest.skip()
    extension = filetype.name.lower()
    # Fall back to the extension when no explicit module-name override exists.
    filetype_module = FILETYPE_TO_MODULE.get(filetype, extension)
    fun_name = "partition_" + filetype_module
    module = import_module(f"unstructured.partition.{filetype_module}")
    # Plain attribute lookup instead of eval() on a constructed string.
    fun = getattr(module, fun_name)
    for file in pathlib.Path("example-docs").iterdir():
        if file.is_file() and file.suffix == f".{extension}":
            elements = fun(str(file))
            assert all(
                el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype]
                for el in elements
                if el.metadata.filetype is not None
            )
            # Only the first matching example document is checked.
            break


def test_auto_partition_xml_from_filename(filename="example-docs/factbook.xml"):
    """Partition an XML file by path with tags stripped and check the first element."""
    first_element = partition(filename=filename, xml_keep_tags=False)[0]

    assert first_element.text == "United States"
    assert first_element.metadata.filename == "factbook.xml"


def test_auto_partition_xml_from_file(filename="example-docs/factbook.xml"):
    """Partition an XML file from a binary file object with tags stripped."""
    with open(filename, "rb") as xml_file:
        parsed = partition(file=xml_file, xml_keep_tags=False)

    first_element = parsed[0]
    assert first_element.text == "United States"


def test_auto_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"):
    """Partition an XML file by path with tags kept and check the element at index 5."""
    name_element = partition(filename=filename, xml_keep_tags=True)[5]

    assert name_element.text == "<name>United States</name>"
    assert name_element.metadata.filename == "factbook.xml"


def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"):
    """Partition an XML file from a binary file object with tags kept."""
    with open(filename, "rb") as xml_file:
        parsed = partition(file=xml_file, xml_keep_tags=True)

    name_element = parsed[5]
    assert name_element.text == "<name>United States</name>"


# Expected ``text_as_html`` metadata for the first table of the stanley-cups examples;
# compared by both the xlsx and csv tests below.
EXPECTED_XLSX_TABLE = """<table border="1" class="dataframe">
  <tbody>
    <tr>
      <td>Team</td>
      <td>Location</td>
      <td>Stanley Cups</td>
    </tr>
    <tr>
      <td>Blues</td>
      <td>STL</td>
      <td>1</td>
    </tr>
    <tr>
      <td>Flyers</td>
      <td>PHI</td>
      <td>2</td>
    </tr>
    <tr>
      <td>Maple Leafs</td>
      <td>TOR</td>
      <td>13</td>
    </tr>
  </tbody>
</table>"""


# Expected element text after ``clean_extra_whitespace`` has been applied.
EXPECTED_XLSX_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"

# Expected ``filetype`` metadata for elements partitioned from the .xlsx example.
EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"


def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
    """Partition an .xlsx file by path and check the first table's text and metadata."""
    parsed = partition(filename=filename)

    assert all(isinstance(item, Table) for item in parsed)
    assert len(parsed) == 2

    first_table = parsed[0]
    assert clean_extra_whitespace(first_table.text) == EXPECTED_XLSX_TEXT
    assert first_table.metadata.text_as_html == EXPECTED_XLSX_TABLE
    assert first_table.metadata.page_number == 1
    assert first_table.metadata.filetype == EXPECTED_XLSX_FILETYPE


def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
    """Partition an .xlsx from a binary file object and check the first table."""
    with open(filename, "rb") as xlsx_file:
        parsed = partition(file=xlsx_file)

    assert all(isinstance(item, Table) for item in parsed)
    assert len(parsed) == 2

    first_table = parsed[0]
    assert clean_extra_whitespace(first_table.text) == EXPECTED_XLSX_TEXT
    assert first_table.metadata.text_as_html == EXPECTED_XLSX_TABLE
    assert first_table.metadata.page_number == 1
    assert first_table.metadata.filetype == EXPECTED_XLSX_FILETYPE


# Expected length of the first element's raw (uncleaned) text in the .xls test below.
EXPECTED_XLS_TEXT_LEN = 883

# Expected first 45 characters of the first element's text after whitespace cleaning.
EXPECTED_XLS_INITIAL_45_CLEAN_TEXT = "MA What C datatypes are 8 bits? (assume i386)"

# Expected ``text_as_html`` metadata for the first element in the .xls test below.
# Two adjacent string literals are concatenated so the long "ESS" cell stays on one
# logical line of the expected HTML.
EXPECTED_XLS_TABLE = (
    """<table border="1" class="dataframe">
  <tbody>
    <tr>
      <td>MA</td>
      <td>What C datatypes are 8 bits? (assume i386)</td>
      <td>int</td>
      <td></td>
      <td>float</td>
      <td></td>
      <td>double</td>
      <td></td>
      <td>char</td>
    </tr>
    <tr>
      <td>TF</td>
      <td>Bagpipes are awesome.</td>
      <td>true</td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>ESS</td>
      <td>How have the original Henry Hornbostel buildings """
    """influenced campus architecture and design in the last 30 years?</td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>ORD</td>
      <td>Rank the following in their order of operation.</td>
      <td>Parentheses</td>
      <td>Exponents</td>
      <td>Division</td>
      <td>Addition</td>
      <td></td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>FIB</td>
      <td>The student activities fee is</td>
      <td>95</td>
      <td>dollars for students enrolled in</td>
      <td>19</td>
      <td>units or more,</td>
      <td></td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>MAT</td>
      <td>Match the lower-case greek letter with its capital form.</td>
      <td>λ</td>
      <td>Λ</td>
      <td>α</td>
      <td>γ</td>
      <td>Γ</td>
      <td>φ</td>
      <td>Φ</td>
    </tr>
  </tbody>
</table>"""
)


def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"):
    """Partition an .xls file by path and check the first table's text and HTML."""
    parsed = partition(filename=filename)

    assert all(isinstance(item, Table) for item in parsed)
    assert len(parsed) == 3

    first_table = parsed[0]
    cleaned_prefix = clean_extra_whitespace(first_table.text)[:45]
    assert cleaned_prefix == EXPECTED_XLS_INITIAL_45_CLEAN_TEXT
    assert len(first_table.text) == EXPECTED_XLS_TEXT_LEN
    assert first_table.metadata.text_as_html == EXPECTED_XLS_TABLE


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
    """Partition a .csv file by path and check the first element's text and metadata."""
    first_element = partition(filename=filename)[0]

    assert clean_extra_whitespace(first_element.text) == EXPECTED_XLSX_TEXT
    assert first_element.metadata.text_as_html == EXPECTED_XLSX_TABLE
    assert first_element.metadata.filetype == "text/csv"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
    """Partition a .csv from a binary file object and check the first element."""
    with open(filename, "rb") as csv_file:
        parsed = partition(file=csv_file)

    first_element = parsed[0]
    assert clean_extra_whitespace(first_element.text) == EXPECTED_XLSX_TEXT
    assert isinstance(first_element, Table)
    assert first_element.metadata.text_as_html == EXPECTED_XLSX_TABLE
    assert first_element.metadata.filetype == "text/csv"


def test_auto_partition_works_on_empty_filename(filename="example-docs/empty.txt"):
    """Partitioning an empty file by path should return an empty list."""
    parsed = partition(filename=filename)
    assert parsed == []


def test_auto_partition_works_on_empty_file(filename="example-docs/empty.txt"):
    """Partitioning the empty text file from a binary handle must return an empty list."""
    with open(filename, "rb") as empty_file:
        elements = partition(file=empty_file)
    assert elements == []
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json" on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								import json
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								import os
 								import pathlib
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
+								import warnings
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								from importlib import import_module
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf falls back to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
+								from unittest.mock import patch
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
 								import docx
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								import pytest
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
+								from unstructured.cleaners.core import clean_extra_whitespace
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								from unstructured.documents.elements import (
 								    Address,
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								    ElementMetadata,
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    ListItem,
 								    NarrativeText,
 								    PageBreak,
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
+								    Table,
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    Text,
 								    Title,
 								)
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, FileType
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								from unstructured.partition import auto
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								from unstructured.partition.auto import partition
-												feat: add `partition_doc` for `.doc` files (#236)

* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
											
										
										
											2023-02-17 09:30:23 -05:00
+								from unstructured.partition.common import convert_office_doc
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json" on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								from unstructured.staging.base import elements_to_json
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								DIRECTORY = pathlib.Path(__file__).parent.resolve()
 								EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
 								EXPECTED_EMAIL_OUTPUT = [
 								    NarrativeText(text="This is a test email to use for unit tests."),
 								    Title(text="Important points:"),
 								    ListItem(text="Roses are red"),
 								    ListItem(text="Violets are blue"),
 								]
-												fix: correct order of kwargs in pandoc (#421)

* fix: correct order of kwargs in pandoc

* only skip epub tests in Docker

* changelog

---------

Co-authored-by: Crag Wolfe <crag@unstructuredai.io>
Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-30 16:54:29 -04:00
+								is_in_docker = os.path.exists("/.dockerenv")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
 								def test_auto_partition_email_from_filename():
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(filename=filename, strategy="hi_res")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_EMAIL_OUTPUT
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset output file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												enhancement: add `file_directory` to element metadata (#585)

* enhancement: add `file_directory` to element metadata

* update msg test

* exclude file_directory

* update slack output

* added file directory tests on partition_x paths
											
										
										
											2023-05-15 18:25:39 -04:00
+								    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
 								def test_auto_partition_email_from_file():
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    with open(filename) as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(file=f, strategy="hi_res")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_EMAIL_OUTPUT
 								def test_auto_partition_email_from_file_rb():
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    with open(filename, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(file=f, strategy="hi_res")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_EMAIL_OUTPUT
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								@pytest.fixture()
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								def mock_docx_document():
 								    document = docx.Document()
 								    document.add_paragraph("These are a few of my favorite things:", style="Heading 1")
 								    # NOTE(robinson) - this should get picked up as a list item due to the •
 								    document.add_paragraph("• Parrots", style="Normal")
 								    document.add_paragraph("Hockey", style="List Bullet")
 								    # NOTE(robinson) - this should get picked up as a title
 								    document.add_paragraph("Analysis", style="Normal")
 								    # NOTE(robinson) - this should get dropped because it is empty
 								    document.add_paragraph("", style="Normal")
 								    # NOTE(robinson) - this should get picked up as a narrative text
 								    document.add_paragraph("This is my first thought. This is my second thought.", style="Normal")
 								    document.add_paragraph("This is my third thought.", style="Body Text")
 								    # NOTE(robinson) - this should just be regular text
 								    document.add_paragraph("2023")
 								    return document
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								@pytest.fixture()
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								def expected_docx_elements():
 								    return [
 								        Title("These are a few of my favorite things:"),
 								        ListItem("Parrots"),
 								        ListItem("Hockey"),
 								        Title("Analysis"),
 								        NarrativeText("This is my first thought. This is my second thought."),
 								        NarrativeText("This is my third thought."),
 								        Text("2023"),
 								    ]
 								def test_auto_partition_docx_with_filename(mock_docx_document, expected_docx_elements, tmpdir):
 								    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
 								    mock_docx_document.save(filename)
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(filename=filename, strategy="hi_res")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert elements == expected_docx_elements
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset output file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
 								def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_elements, tmpdir):
 								    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
 								    mock_docx_document.save(filename)
 								    with open(filename, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(file=f, strategy="hi_res")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert elements == expected_docx_elements
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
 								    ("pass_file_filename", "content_type"),
 								    [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
 								)
 								def test_auto_partition_doc_with_filename(
 								    mock_docx_document,
 								    expected_docx_elements,
 								    tmpdir,
 								    pass_file_filename,
 								    content_type,
 								):
-												feat: add `partition_doc` for `.doc` files (#236)

* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
											
										
										
											2023-02-17 09:30:23 -05:00
+								    docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
 								    doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
 								    mock_docx_document.save(docx_filename)
 								    convert_office_doc(docx_filename, tmpdir.dirname, "doc")
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    file_filename = doc_filename if pass_file_filename else None
 								    elements = partition(
 								        filename=doc_filename,
 								        file_filename=file_filename,
 								        content_type=content_type,
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        strategy="hi_res",
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    )
-												feat: add `partition_doc` for `.doc` files (#236)

* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
											
										
										
											2023-02-17 09:30:23 -05:00
+								    assert elements == expected_docx_elements
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset output file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == "mock_document.doc"
-												enhancement: add `file_directory` to element metadata (#585)

* enhancement: add `file_directory` to element metadata

* update msg test

* exclude file_directory

* update slack output

* added file directory tests on partition_x paths
											
										
										
											2023-05-15 18:25:39 -04:00
+								    assert elements[0].metadata.file_directory == tmpdir.dirname
-												feat: add `partition_doc` for `.doc` files (#236)

* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
											
										
										
											2023-02-17 09:30:23 -05:00
 								# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
 								# determine that the file is an .doc document
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								@pytest.mark.xfail()
-												feat: add `partition_doc` for `.doc` files (#236)

* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
											
										
										
											2023-02-17 09:30:23 -05:00
+								def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements, tmpdir):
 								    docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
 								    doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
 								    mock_docx_document.save(docx_filename)
 								    convert_office_doc(docx_filename, tmpdir.dirname, "doc")
 								    with open(doc_filename, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(file=f, strategy="hi_res")
-												feat: add `partition_doc` for `.doc` files (#236)

* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
											
										
										
											2023-02-17 09:30:23 -05:00
+								    assert elements == expected_docx_elements
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
 								    ("pass_file_filename", "content_type"),
 								    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
 								)
 								def test_auto_partition_html_from_filename(pass_file_filename, content_type):
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    file_filename = filename if pass_file_filename else None
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(
 								        filename=filename,
 								        file_filename=file_filename,
 								        content_type=content_type,
 								        strategy="hi_res",
 								    )
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset output file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												enhancement: add `file_directory` to element metadata (#585)

* enhancement: add `file_directory` to element metadata

* update msg test

* exclude file_directory

* update slack output

* added file directory tests on partition_x paths
											
										
										
											2023-05-15 18:25:39 -04:00
+								    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
 								    ("pass_file_filename", "content_type"),
 								    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
 								)
 								def test_auto_partition_html_from_file(pass_file_filename, content_type):
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    file_filename = filename if pass_file_filename else None
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    with open(filename) as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(
 								            file=f,
 								            file_filename=file_filename,
 								            content_type=content_type,
 								            strategy="hi_res",
 								        )
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
 								def test_auto_partition_html_from_file_rb():
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    with open(filename, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(file=f, strategy="hi_res")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    assert len(elements) > 0
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json" on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								def test_auto_partition_json_from_filename():
 								    """Test auto-processing an unstructured json output file by filename."""
 								    filename = os.path.join(
 								        EXAMPLE_DOCS_DIRECTORY,
 								        "..",
 								        "test_unstructured_ingest",
 								        "expected-structured-output",
 								        "azure-blob-storage",
 								        "spring-weather.html.json",
 								    )
 								    with open(filename) as json_f:
 								        json_data = json.load(json_f)
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    json_elems = json.loads(elements_to_json(partition(filename=filename, strategy="hi_res")))
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json" on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								    for elem in json_elems:
 								        # coordinates are always in the element data structures, even if None
 								        elem.pop("coordinates")
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								        elem.pop("metadata")
 								    for elem in json_data:
 								        elem.pop("metadata")
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json" on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								    assert json_data == json_elems
 								@pytest.mark.xfail(
 								    reason="parsed as text not json, https://github.com/Unstructured-IO/unstructured/issues/492",
 								)
 								def test_auto_partition_json_from_file():
 								    """Test auto-processing an unstructured json output file by file handle."""
 								    filename = os.path.join(
 								        EXAMPLE_DOCS_DIRECTORY,
 								        "..",
 								        "test_unstructured_ingest",
 								        "expected-structured-output",
 								        "azure-blob-storage",
 								        "spring-weather.html.json",
 								    )
 								    with open(filename) as json_f:
 								        json_data = json.load(json_f)
 								    with open(filename, encoding="utf-8") as partition_f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        json_elems = json.loads(elements_to_json(partition(file=partition_f, strategy="hi_res")))
-												fix: workaround .json file detection with old libmagic installs (#493)

Fixes issue where .json files were recognized as "text/plain" rather than "application/json" on
the Unstructured image (and other installs that may have an older libmagic).

Also adds missing json auto partition tests.

Including an xfail test for #492 .
											
										
										
											2023-04-17 23:11:21 -07:00
+								    for elem in json_elems:
 								        # coordinates are always in the element data structures, even if None
 								        elem.pop("coordinates")
 								    assert json_data == json_elems
-												feat: add support for `.txt` files in `partition` (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
											
										
										
											2023-01-13 16:39:53 -05:00
+								EXPECTED_TEXT_OUTPUT = [
 								    NarrativeText(text="This is a test document to use for unit tests."),
-												fix: cleanup from live `.docx` tests (#177)

* add env var for cap threshold; raise default threshold

* update docs and tests

* added check for ending in a comma

* update docs

* no caps check for all upper text

* capture Text in html and text

* check category in Text equality check

* lower case all caps before checking for verbs

* added check for us city/state/zip

* added address type

* add address to html

* add address to text

* fix for text tests; escape for large text segments

* refactor regex for readability

* update comment

* additional test for text with linebreaks

* update docs

* update changelog

* update elements docs

* remove old comment

* case -> cast

* type fix
											
										
										
											2023-01-26 10:52:25 -05:00
+								    Address(text="Doylestown, PA 18901"),
-												feat: add support for `.txt` files in `partition` (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
											
										
										
											2023-01-13 16:39:53 -05:00
+								    Title(text="Important points:"),
 								    ListItem(text="Hamburgers are delicious"),
 								    ListItem(text="Dogs are the best"),
 								    ListItem(text="I love fuzzy blankets"),
 								]
 								def test_auto_partition_text_from_filename():
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(filename=filename, strategy="hi_res")
-												feat: add support for `.txt` files in `partition` (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
											
										
										
											2023-01-13 16:39:53 -05:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_TEXT_OUTPUT
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset output file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												enhancement: add `file_directory` to element metadata (#585)

* enhancement: add `file_directory` to element metadata

* update msg test

* exclude file_directory

* update slack output

* added file directory tests on partition_x paths
											
										
										
											2023-05-15 18:25:39 -04:00
+								    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
-												feat: add support for `.txt` files in `partition` (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
											
										
										
											2023-01-13 16:39:53 -05:00
 								def test_auto_partition_text_from_file():
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    with open(filename) as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(file=f, strategy="hi_res")
-												feat: add support for `.txt` files in `partition` (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
											
										
										
											2023-01-13 16:39:53 -05:00
+								    assert len(elements) > 0
 								    assert elements == EXPECTED_TEXT_OUTPUT
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
 								    ("pass_file_filename", "content_type"),
 								    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
 								)
-												build(deps): update inference version (#662)

Updated to the latest version of unstructured-inference. detectron2 now gets implemented with onnxruntime, yay!

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
											
										
										
											2023-05-31 13:50:15 -05:00
+								def test_auto_partition_pdf_from_filename(pass_file_filename, content_type, request):
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    file_filename = filename if pass_file_filename else None
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(
 								        filename=filename,
 								        file_filename=file_filename,
 								        content_type=content_type,
 								        strategy="hi_res",
 								    )
-												chore: return `Element` objects in `partition_pdf` and `partition_image` (#164)

* helper function to convert to element

* test for element types

* fix for healthcheck url

* version bump

* note on coordinates

* mention FigureCaption

* test_shared -> test_common

* add check boxes for checkbox template

* update changelog
											
										
										
											2023-01-19 09:29:28 -05:00
 								    assert isinstance(elements[0], Title)
 								    assert elements[0].text.startswith("LayoutParser")
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset output file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												enhancement: add `file_directory` to element metadata (#585)

* enhancement: add `file_directory` to element metadata

* update msg test

* exclude file_directory

* update slack output

* added file directory tests on partition_x paths
											
										
										
											2023-05-15 18:25:39 -04:00
+								    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
-												feat: add metadata tracking to document elements (#225)

* add metadata field to elements

* metadata tracking for pdf/image

* metadata for html

* update expected outputs

* metadata for the rest of the document types

* take out file metadata for now

* add url to tables

* added metadata to test_auto

* bump version

* added coordinates to __init__

* fix coordinates in tests
											
										
										
											2023-02-15 13:26:20 -05:00
-												build(deps): update inference version (#662)

Updated to the latest version of unstructured-inference. detectron2 now gets implemented with onnxruntime, yay!

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
											
										
										
											2023-05-31 13:50:15 -05:00
+								    # NOTE(alan): Xfail since new model skips the word Zejiang
 								    request.applymarker(pytest.mark.xfail)
 								    assert isinstance(elements[1], NarrativeText)
 								    assert elements[1].text.startswith("Zejiang Shen")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												feat: extract tables (#503)

Exposes table extraction through partition and partition_pdf.
											
										
										
											2023-04-21 12:01:29 -05:00
+								def test_auto_partition_pdf_uses_table_extraction():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
 								    with patch(
 								        "unstructured_inference.inference.layout.process_file_with_model",
 								    ) as mock_process_file_with_model:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        partition(filename, pdf_infer_table_structure=True, strategy="hi_res")
-												feat: extract tables (#503)

Exposes table extraction through partition and partition_pdf.
											
										
										
											2023-04-21 12:01:29 -05:00
+								        assert mock_process_file_with_model.call_args[1]["extract_tables"]
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf falls back to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
+								def test_auto_partition_pdf_with_fast_strategy():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
 								    mock_return = [NarrativeText("Hello there!")]
 								    with patch.object(auto, "partition_pdf", return_value=mock_return) as mock_partition:
 								        partition(filename=filename, strategy="fast")
 								    mock_partition.assert_called_once_with(
 								        filename=filename,
 								        file=None,
 								        url=None,
 								        include_page_breaks=False,
-												chore: change table param name (#513)

Updated parameter names that controls whether we try to infer table structure.
											
										
										
											2023-04-21 13:48:19 -05:00
+								        infer_table_structure=False,
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf falls back to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
+								        strategy="fast",
-												feat: allow users to pass OCR language into `partition` (#509)

* pip-compile new reqs

* bump inference version

* add language to pdf and image calls

* tests for passing in language

* version bump and changelog

* update docs

* pass ocr_languages in auto

* updated test fixtures

* typo in doc string
											
										
										
											2023-04-21 09:41:26 -04:00
+								        ocr_languages="eng",
-												feat: add `"fast"` strategy for PDF parsing; fallback to `"fast"` if `detectron2` is not available (#357)

Adds a "fast" strategy for partitioning PDFs that uses pdfminer. The default strategy is "hi_res" and is the original partitioning logic that uses detectron2. If detectron2 is not available and the "hi_res" strategy is selected, partition_pdf falls back to using the "fast" strategy. The implementation uses pdfminer because that's already installed as a dependency with the local-inference extra. There are other options for accomplishing this as well, but they would entail adding a new dependency. The "fast" strategy substantially speeds up processing.
											
										
										
											2023-03-10 22:16:05 -05:00
+								    )
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
 								    ("pass_file_filename", "content_type"),
 								    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
 								)
-												build(deps): update inference version (#662)

Updated to the latest version of unstructured-inference. detectron2 now gets implemented with onnxruntime, yay!

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
											
										
										
											2023-05-31 13:50:15 -05:00
+								def test_auto_partition_pdf_from_file(pass_file_filename, content_type, request):
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    file_filename = filename if pass_file_filename else None
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								    with open(filename, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(
 								            file=f,
 								            file_filename=file_filename,
 								            content_type=content_type,
 								            strategy="hi_res",
 								        )
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
 								    assert isinstance(elements[0], Title)
 								    assert elements[0].text.startswith("LayoutParser")
-												build(deps): update inference version (#662)

Updated to the latest version of unstructured-inference. detectron2 now gets implemented with onnxruntime, yay!

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
											
										
										
											2023-05-31 13:50:15 -05:00
+								    # NOTE(alan): Xfail since new model misses the first word Zejiang
 								    request.applymarker(pytest.mark.xfail)
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
+								    assert isinstance(elements[1], NarrativeText)
-												build(deps): bump requirements (#414)


											
										
										
											2023-04-04 19:59:06 -07:00
+								    assert elements[1].text.startswith("Zejiang Shen")
-												fix: no `UserWarning` when `partition_pdf` is called (#179)


											
										
										
											2023-01-27 12:08:18 -05:00
 								def test_partition_pdf_doesnt_raise_warning():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
 								    # NOTE(robinson): This is the recommended way to check that no warning is emitted,
 								    # per the pytest docs.
 								    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
 								    #      #additional-use-cases-of-warnings-in-tests
 								    with warnings.catch_warnings():
 								        warnings.simplefilter("error")
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        partition(filename=filename, strategy="hi_res")
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
 								    ("pass_file_filename", "content_type"),
 								    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
 								)
 								def test_auto_partition_jpg(pass_file_filename, content_type):
-												enhancement: auto strategy for PDFs and images (#578)

* added functions for determining auto strategy

* change default strategy to auto

* tests for auto strategy

* update docs

* changelog and version

* bump version

* remove ingest file in wrong location

* update jpg output

* typo fix
											
										
										
											2023-05-12 13:45:08 -04:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    file_filename = filename if pass_file_filename else None
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(
 								        filename=filename,
 								        file_filename=file_filename,
 								        content_type=content_type,
-												Fix: Pass `strategy` parameter down from `partition` for `partition_image` (#708)

* changelog and version

* passing param down

* test should be auto

* doc nit

* lint

* update image output
											
										
										
											2023-06-09 13:54:18 -04:00
+								        strategy="auto",
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    )
-												feat: partition image (#144)

Adds partition_image to partition image file types, which is integrated into the partition brick. This relies on the 0.2.2 version of unstructured-inference.
											
										
										
											2023-01-13 22:24:13 -06:00
+								    assert len(elements) > 0
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								@pytest.mark.parametrize(
 								    ("pass_file_filename", "content_type"),
 								    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
 								)
 								def test_auto_partition_jpg_from_file(pass_file_filename, content_type):
-												enhancement: auto strategy for PDFs and images (#578)

* added functions for determining auto strategy

* change default strategy to auto

* tests for auto strategy

* update docs

* changelog and version

* bump version

* remove ingest file in wrong location

* update jpg output

* typo fix
											
										
										
											2023-05-12 13:45:08 -04:00
+								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
-												Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-24 16:32:45 -07:00
+								    file_filename = filename if pass_file_filename else None
-												feat: partition image (#144)

Adds partition_image to partition image file types, which is integrated into the partition brick. This relies on the 0.2.2 version of unstructured-inference.
											
										
										
											2023-01-13 22:24:13 -06:00
+								    with open(filename, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(
 								            file=f,
 								            file_filename=file_filename,
 								            content_type=content_type,
-												Fix: Pass `strategy` parameter down from `partition` for `partition_image` (#708)

* changelog and version

* passing param down

* test should be auto

* doc nit

* lint

* update image output
											
										
										
											2023-06-09 13:54:18 -04:00
+								            strategy="auto",
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        )
-												feat: partition image (#144)

Adds partition_image to partition image file types, which is integrated into the partition brick. This relies on the 0.2.2 version of unstructured-inference.
											
										
										
											2023-01-13 22:24:13 -06:00
+								    assert len(elements) > 0
-												feat: generic `partition` brick with filetype detection (#132)

* add python-magic

* first pass on filetype detection

* tests for filetype detection

* more tests for file detection

* added tests for error conditions

* install libmagic dev in github

* libmagic install instructions

* pattern for checking email files

* support reading .eml in rb mode

* add auto partition function

* auto tests for email

* auto tests for docx

* added tests for html

* add pdf and html tests

* linting, linting, linting

* added docs for auto partitioning

* update readme with generic partition brick

* bumped version

* added test for bad type

* detect .docx files from application/octet-stream

* linting, linting, linting

* identify xlsx from octet stream

* install poppler in ci

* fix mocks; test for unknown type

* install poppler utils

* install in one line

* only poppler-utils

* file extension logic from application/octet-stream

* install local inference for ci

* install detectron2

* removing unused dockerfile
											
										
										
											2023-01-09 16:15:14 -05:00
+								def test_auto_partition_raises_with_bad_type(monkeypatch):
 								    monkeypatch.setattr(auto, "detect_filetype", lambda *args, **kwargs: None)
 								    with pytest.raises(ValueError):
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        partition(filename="made-up.fake", strategy="hi_res")
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
 								# Elements the PowerPoint tests in this file expect from the fake-power-point
 								# example documents; both the .pptx and the .ppt test compare against this list.
 								EXPECTED_PPTX_OUTPUT = [
 								    Title(text="Adding a Bullet Slide"),
 								    ListItem(text="Find the bullet slide layout"),
 								    ListItem(text="Use _TextFrame.text for first bullet"),
 								    ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
 								    NarrativeText(text="Here is a lot of text!"),
 								    NarrativeText(text="Here is some text in a text box!"),
 								]
 								def test_auto_partition_pptx_from_filename():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(filename=filename, strategy="hi_res")
-												feat: basic PowerPoint parsing in `partition_pptx` (#166)

* partition pptx and tests

* add partition_pptx to auto

* update doc types in readme

* add pptx docs

* bump version

* remove extra whitespace

* partition -> partitioning
											
										
										
											2023-01-23 12:03:09 -05:00
+								    assert elements == EXPECTED_PPTX_OUTPUT
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset output file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												enhancement: add `file_directory` to element metadata (#585)

* enhancement: add `file_directory` to element metadata

* update msg test

* exclude file_directory

* update slack output

* added file directory tests on partition_x paths
											
										
										
											2023-05-15 18:25:39 -04:00
+								    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
-												feat: optional page breaks for `.pptx`, `.pdf`, `.html` and images (#205)

* page breaks for pptx

* added page breaks for image/pdf

* tests for images with page breaks

* page breaks for html documents

* linting, linting, linting

* changelog and bump version

* update docs

* fix typo

* refactor reusable code to common.py

* add type back in
											
										
										
											2023-02-08 10:11:15 -05:00
-												fix: correct order of kwargs in pandoc (#421)

* fix: correct order of kwargs in pandoc

* only skip epub tests in Docker

* changelog

---------

Co-authored-by: Crag Wolfe <crag@unstructuredai.io>
Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-03-30 16:54:29 -04:00
+								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
-												feat: add `partition_ppt` for older power point docs (#238)

* added partition_ppt function and tests

* add ppt support to auto

* version bump

* update docs

* doc fixes

* update changelog

* `.docx` -> `.pptx`

* its -> their

* remove whitespace
											
										
										
											2023-02-17 11:57:08 -05:00
+								def test_auto_partition_ppt_from_filename():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(filename=filename, strategy="hi_res")
-												feat: add `partition_ppt` for older power point docs (#238)

* added partition_ppt function and tests

* add ppt support to auto

* version bump

* update docs

* doc fixes

* update changelog

* `.docx` -> `.pptx`

* its -> their

* remove whitespace
											
										
										
											2023-02-17 11:57:08 -05:00
+								    assert elements == EXPECTED_PPTX_OUTPUT
-												enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset output file

* update filename output

* update test fixtures

* update fixture
											
										
										
											2023-05-12 11:33:01 -04:00
+								    assert elements[0].metadata.filename == os.path.basename(filename)
-												enhancement: add `file_directory` to element metadata (#585)

* enhancement: add `file_directory` to element metadata

* update msg test

* exclude file_directory

* update slack output

* added file directory tests on partition_x paths
											
										
										
											2023-05-15 18:25:39 -04:00
+								    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
-												feat: add `partition_ppt` for older power point docs (#238)

* added partition_ppt function and tests

* add ppt support to auto

* version bump

* update docs

* doc fixes

* update changelog

* `.docx` -> `.pptx`

* its -> their

* remove whitespace
											
										
										
											2023-02-17 11:57:08 -05:00
-												feat: optional page breaks for `.pptx`, `.pdf`, `.html` and images (#205)

* page breaks for pptx

* added page breaks for image/pdf

* tests for images with page breaks

* page breaks for html documents

* linting, linting, linting

* changelog and bump version

* update docs

* fix typo

* refactor reusable code to common.py

* add type back in
											
										
										
											2023-02-08 10:11:15 -05:00
+								def test_auto_with_page_breaks():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(filename=filename, include_page_breaks=True, strategy="hi_res")
-												feat: optional page breaks for `.pptx`, `.pdf`, `.html` and images (#205)

* page breaks for pptx

* added page breaks for image/pdf

* tests for images with page breaks

* page breaks for html documents

* linting, linting, linting

* changelog and bump version

* update docs

* fix typo

* refactor reusable code to common.py

* add type back in
											
										
										
											2023-02-08 10:11:15 -05:00
+								    assert PageBreak() in elements
-												feat: add `partition_epub` function (#364)

* add pypandoc dependency

* added epub partitioner and file conversion

* test for partition_epub

* tests for file conversion

* add epub to filetype detection

* added epub to auto partition

* update bricks docs

* updated installing docs

* changelog and version

* add pandoc to dependencies

* add pandoc to debian dependencies

* linting, linting, linting

* typo fix

* typo fix

* file conversion type hints

* more type hints

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-03-14 11:52:21 -04:00
 								def test_auto_partition_epub_from_filename():
 								    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(filename=filename, strategy="hi_res")
-												feat: add `partition_epub` function (#364)

* add pypandoc dependency

* added epub partitioner and file conversion

* test for partition_epub

* tests for file conversion

* add epub to filetype detection

* added epub to auto partition

* update bricks docs

* updated installing docs

* changelog and version

* add pandoc to dependencies

* add pandoc to debian dependencies

* linting, linting, linting

* typo fix

* typo fix

* file conversion type hints

* more type hints

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-03-14 11:52:21 -04:00
+								    assert len(elements) > 0
 								    assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
 								def test_auto_partition_epub_from_file():
 								    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
 								    with open(filename, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(file=f, strategy="hi_res")
-												feat: add `partition_epub` function (#364)

* add pypandoc dependency

* added epub partitioner and file conversion

* test for partition_epub

* tests for file conversion

* add epub to filetype detection

* added epub to auto partition

* update bricks docs

* updated installing docs

* changelog and version

* add pandoc to dependencies

* add pandoc to debian dependencies

* linting, linting, linting

* typo fix

* typo fix

* file conversion type hints

* more type hints

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-03-14 11:52:21 -04:00
+								    assert len(elements) > 0
 								    assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
-												feat: add `partition_msg` for MSFT Outlook files (#412)

* added msg-parser dependency

* pass through kwargs in convert_file_to_text

* added partition_msg for processing msft outlook files

* version bump and changelog

* added tests for partition_msg

* added test for msg with plain text

* add partition_msg docs; fix underlines in integration docs

* add .msg to file list

* finish tests for auto msg

* linting, linting, linting
											
										
										
											2023-03-28 16:15:22 -04:00
 								# Elements the Outlook test in this file expects from partitioning fake-email.msg.
 								EXPECTED_MSG_OUTPUT = [
 								    NarrativeText(text="This is a test email to use for unit tests."),
 								    Title(text="Important points:"),
 								    ListItem(text="Roses are red"),
 								    ListItem(text="Violets are blue"),
 								]
 								def test_auto_partition_msg_from_filename():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(filename=filename, strategy="hi_res")
-												feat: add `partition_msg` for MSFT Outlook files (#412)

* added msg-parser dependency

* pass through kwargs in convert_file_to_text

* added partition_msg for processing msft outlook files

* version bump and changelog

* added tests for partition_msg

* added test for msg with plain text

* add partition_msg docs; fix underlines in integration docs

* add .msg to file list

* finish tests for auto msg

* linting, linting, linting
											
										
										
											2023-03-28 16:15:22 -04:00
+								    assert elements == EXPECTED_MSG_OUTPUT
-												feat: add `partition_rtf` for rich text files (#466)

* refactor epub; add rtf

* added test for rtf files

* filetype detection for rtf files

* add rtf to auto

* update docs for group_broken_paragraphs

* add rtf to docs

* update file list in readme

* update stage_for_transformers docs

* changelog and version bump

* skip rtf if in docker

* skip test if rtf not supported

* docs tweaks
											
										
										
											2023-04-10 17:25:03 -04:00
 								def test_auto_partition_rtf_from_filename():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf")
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(filename=filename, strategy="hi_res")
-												feat: add `partition_rtf` for rich text files (#466)

* refactor epub; add rtf

* added test for rtf files

* filetype detection for rtf files

* add rtf to auto

* update docs for group_broken_paragraphs

* add rtf to docs

* update file list in readme

* update stage_for_transformers docs

* changelog and version bump

* skip rtf if in docker

* skip test if rtf not supported

* docs tweaks
											
										
										
											2023-04-10 17:25:03 -04:00
+								    assert elements[0] == Title("My First Heading")
-												feat: add `url` kwarg to `partititon` (#470)

* added url option to auto partition

* add test for partition from url

* version and changelog

* update docs

* add url to element metadata
											
										
										
											2023-04-12 14:31:01 -04:00
 								def test_auto_partition_from_url():
 								    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(url=url, content_type="text/plain", strategy="hi_res")
-												feat: add `url` kwarg to `partititon` (#470)

* added url option to auto partition

* add test for partition from url

* version and changelog

* update docs

* add url to element metadata
											
										
										
											2023-04-12 14:31:01 -04:00
+								    assert elements[0] == Title("Apache License")
 								    assert elements[0].metadata.url == url
-												feat: allow headers in `partition` (#473)

* feat: allow headers in `partition`

* warning if header is set and url is not

* update emoji test
											
										
										
											2023-04-13 11:04:15 -04:00
-												fix: updates markdown code to process markdown with embedded html (#480)

* add carriage return to html if missing

* test on markdown with embedded html

* changelog and version

* check for html parser

* linting, linting, linting
											
										
										
											2023-04-13 12:47:45 -04:00
+								def test_partition_md_works_with_embedded_html():
 								    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(url=url, content_type="text/markdown", strategy="hi_res")
-												fix: updates markdown code to process markdown with embedded html (#480)

* add carriage return to html if missing

* test on markdown with embedded html

* changelog and version

* check for html parser

* linting, linting, linting
											
										
										
											2023-04-13 12:47:45 -04:00
+								    elements[0].text
 								    # Look for the marker text in every element. The previous loop re-tested
 								    # elements[0] on each iteration, so only the first element was ever inspected.
 								    unstructured_found = any("unstructured" in element.text for element in elements)
 								    assert unstructured_found is True
-												feat: allow headers in `partition` (#473)

* feat: allow headers in `partition`

* warning if header is set and url is not

* update emoji test
											
										
										
											2023-04-13 11:04:15 -04:00
+								def test_auto_partition_warns_if_header_set_and_not_url(caplog):
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    partition(filename=filename, headers={"Accept": "application/pdf"}, strategy="hi_res")
-												feat: allow headers in `partition` (#473)

* feat: allow headers in `partition`

* warning if header is set and url is not

* update emoji test
											
										
										
											2023-04-13 11:04:15 -04:00
+								    assert caplog.records[0].levelname == "WARNING"
-												fix: update `detect_filetype` for JSONs with text/plain MIME type (#520)

* check to see if text file is a json

* add json check into filetype detection

* added test for updated file detection logic

* bytes/strings handling

* changelog and version bump
											
										
										
											2023-04-26 13:52:47 -04:00
 								def test_auto_partition_works_with_unstructured_jsons():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(filename=filename, strategy="hi_res")
-												fix: update `detect_filetype` for JSONs with text/plain MIME type (#520)

* check to see if text file is a json

* add json check into filetype detection

* added test for updated file detection logic

* bytes/strings handling

* changelog and version bump
											
										
										
											2023-04-26 13:52:47 -04:00
+								    assert elements[0].text == "News Around NOAA"
 								def test_auto_partition_works_with_unstructured_jsons_from_file():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
 								    with open(filename, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(file=f, strategy="hi_res")
-												fix: update `detect_filetype` for JSONs with text/plain MIME type (#520)

* check to see if text file is a json

* add json check into filetype detection

* added test for updated file detection logic

* bytes/strings handling

* changelog and version bump
											
										
										
											2023-04-26 13:52:47 -04:00
+								    assert elements[0].text == "News Around NOAA"
-												feat: add `partition_odt` for open office docs (#548)

* added filetype detection for odt

* add function for partition odt documents

* add odt files to auto

* changelog and version

* docs and readme

* update installation docs

* skip tests if not supported or in docker

* import pytest

* fix docs typos
											
										
										
											2023-05-04 15:28:08 -04:00
 								def test_auto_partition_odt_from_filename():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								    elements = partition(filename=filename, strategy="hi_res")
-												feat: add `partition_odt` for open office docs (#548)

* added filetype detection for odt

* add function for partition odt documents

* add odt files to auto

* changelog and version

* docs and readme

* update installation docs

* skip tests if not supported or in docker

* import pytest

* fix docs typos
											
										
										
											2023-05-04 15:28:08 -04:00
+								    assert elements == [Title("Lorem ipsum dolor sit amet.")]
 								def test_auto_partition_odt_from_file():
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
 								    with open(filename, "rb") as f:
-												Chore(ingest): Add `--partition-strategy` parameter in  CLI (#582)

* change strategy arg default to auto in partition

* passing --partition-strategy down

* add strategy="hi_res" to test (default changed)

* made an error on param name, added note

											
										
										
											2023-05-15 15:26:53 -04:00
+								        elements = partition(file=f, strategy="hi_res")
-												feat: add `partition_odt` for open office docs (#548)

* added filetype detection for odt

* add function for partition odt documents

* add odt files to auto

* changelog and version

* docs and readme

* update installation docs

* skip tests if not supported or in docker

* import pytest

* fix docs typos
											
										
										
											2023-05-04 15:28:08 -04:00
 								    assert elements == [Title("Lorem ipsum dolor sit amet.")]
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
 								@pytest.mark.parametrize(
 								    ("content_type", "routing_func", "expected"),
 								    [
 								        ("application/json", "json", "application/json"),
 								        ("text/html", "html", "text/html"),
 								        ("jdsfjdfsjkds", "pdf", None),
 								    ],
 								)
 								def test_auto_adds_filetype_to_metadata(content_type, routing_func, expected):
 								    """Check the filetype metadata `partition` attaches for a given content type.

 								    The partitioner named by `routing_func` is patched with a stub returning two
 								    bare `Text` elements; every returned element's `metadata.filetype` must then
 								    equal `expected`.
 								    """
 								    # Build the path from EXAMPLE_DOCS_DIRECTORY, like the other tests in this
 								    # module, so the test does not depend on the current working directory.
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
 								    with patch(
 								        f"unstructured.partition.auto.partition_{routing_func}",
 								        lambda *args, **kwargs: [Text("text 1"), Text("text 2")],
 								    ):
 								        elements = partition(filename, content_type=content_type)
 								    assert len(elements) == 2
 								    assert all(el.metadata.filetype == expected for el in elements)
 								@pytest.mark.parametrize(
 								    ("content_type", "expected"),
 								    [
 								        ("application/pdf", FILETYPE_TO_MIMETYPE[FileType.PDF]),
 								        (None, FILETYPE_TO_MIMETYPE[FileType.PDF]),
 								    ],
 								)
 								def test_auto_filetype_overrides_file_specific(content_type, expected):
 								    """Check that `partition` replaces a filetype set by the file-specific partitioner.

 								    `partition_pdf` is patched with a stub whose elements carry the bogus filetype
 								    "imapdf"; the elements returned by `partition` must report `expected` instead.
 								    """
 								    # Build the path from EXAMPLE_DOCS_DIRECTORY, like the other tests in this
 								    # module, so the test does not depend on the current working directory.
 								    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
 								    pdf_metadata = ElementMetadata(filetype="imapdf")
 								    with patch(
 								        "unstructured.partition.auto.partition_pdf",
 								        lambda *args, **kwargs: [
 								            Text("text 1", metadata=pdf_metadata),
 								            Text("text 2", metadata=pdf_metadata),
 								        ],
 								    ):
 								        elements = partition(filename, content_type=content_type)
 								    assert len(elements) == 2
 								    assert all(el.metadata.filetype == expected for el in elements)
 								# Every FileType member except UNK, ZIP and XLS; used to parametrize the
 								# file-specific filetype test.
 								supported_filetypes = [
 								    filetype
 								    for filetype in FileType
 								    if filetype not in (FileType.UNK, FileType.ZIP, FileType.XLS)
 								]
 								# Module-name suffix (unstructured.partition.<suffix>) for file types whose
 								# partition module is not simply the lower-cased enum name.
 								FILETYPE_TO_MODULE = {
 								    FileType.JPG: "image",
 								    FileType.PNG: "image",
 								    FileType.TXT: "text",
 								    FileType.EML: "email",
 								}
 								@pytest.mark.parametrize("filetype", supported_filetypes)
 								def test_file_specific_produces_correct_filetype(filetype: FileType):
-												enhancement: handling for empty files in `detect_filetype` and `partition` (#710)

* add empty filetype

* add empty handling to partition

* changelog and version
											
										
										
											2023-06-09 16:07:50 -04:00
+								    if filetype in (FileType.JPG, FileType.PNG, FileType.EMPTY):
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								        pytest.skip()
 								    extension = filetype.name.lower()
 								    filetype_module = (
 								        extension if filetype not in FILETYPE_TO_MODULE else FILETYPE_TO_MODULE[filetype]
 								    )
 								    fun_name = "partition_" + filetype_module
 								    module = import_module(f"unstructured.partition.{filetype_module}")  # noqa
 								    fun = eval(f"module.{fun_name}")
 								    for file in pathlib.Path("example-docs").iterdir():
 								        if file.is_file() and file.suffix == f".{extension}":
 								            elements = fun(str(file))
-												enhancement: handling for empty files in `detect_filetype` and `partition` (#710)

* add empty filetype

* add empty handling to partition

* changelog and version
											
										
										
											2023-06-09 16:07:50 -04:00
+								            assert all(
 								                el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype]
 								                for el in elements
 								                if el.metadata.filetype is not None
 								            )
-												enhancement: filetype in metadata (#583)

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
											
										
										
											2023-05-15 13:23:19 -05:00
+								            break
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
-												feat: add `partition_xml` for XML files (#596)

* first pass on partition_xml

* add option to keep xml tags

* added tests for xml

* fix filename

* update filenames

* remove outdated readme

* add xml to auto

* version and changelog

* update readme and docs

* pass through include_metadata

* update include_metadata description

* add README back in

* linting, linting, linting

* more linting

* spooled to bytes doesn't need to be a tuple

* Add tests for newly supported filetypes

* Correct metadata filetype

* doc typo

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* typo fix

Co-authored-by: qued <64741807+qued@users.noreply.github.com>

* keep_xml_tags -> xml_keep_tags

---------

Co-authored-by: Alan Bertl <alan@unstructured.io>
Co-authored-by: qued <64741807+qued@users.noreply.github.com>
											
										
										
											2023-05-18 11:40:12 -04:00
+								def test_auto_partition_xml_from_filename(filename="example-docs/factbook.xml"):
 								    elements = partition(filename=filename, xml_keep_tags=False)
 								    assert elements[0].text == "United States"
 								    assert elements[0].metadata.filename == "factbook.xml"
 								def test_auto_partition_xml_from_file(filename="example-docs/factbook.xml"):
 								    """Partition the XML from a binary file handle without tags and check the first element's text."""
 								    with open(filename, "rb") as xml_file:
 								        parsed = partition(file=xml_file, xml_keep_tags=False)
 								    leading = parsed[0]
 								    assert leading.text == "United States"
 								def test_auto_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"):
 								    """Partition the XML path keeping tags and check the element at index 5 and its filename."""
 								    tagged = partition(filename=filename, xml_keep_tags=True)[5]
 								    assert tagged.text == "<name>United States</name>"
 								    assert tagged.metadata.filename == "factbook.xml"
 								def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.xml"):
 								    """Partition the XML from a binary file handle keeping tags and check the element at index 5."""
 								    with open(filename, "rb") as xml_file:
 								        parsed = partition(file=xml_file, xml_keep_tags=True)
 								    tagged = parsed[5]
 								    assert tagged.text == "<name>United States</name>"
-												feat: add `partition_xlsx` for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
											
										
										
											2023-05-16 15:40:40 -04:00
 								# Expected ``metadata.text_as_html`` of the first element in the XLSX tests;
 								# the CSV tests below compare against the same value.
+								EXPECTED_XLSX_TABLE = """<table border="1" class="dataframe">
 								  <tbody>
 								    <tr>
 								      <td>Team</td>
 								      <td>Location</td>
 								      <td>Stanley Cups</td>
 								    </tr>
 								    <tr>
 								      <td>Blues</td>
 								      <td>STL</td>
 								      <td>1</td>
 								    </tr>
 								    <tr>
 								      <td>Flyers</td>
 								      <td>PHI</td>
 								      <td>2</td>
 								    </tr>
 								    <tr>
 								      <td>Maple Leafs</td>
 								      <td>TOR</td>
 								      <td>13</td>
 								    </tr>
 								  </tbody>
 								</table>"""
 								# Expected first-element text after ``clean_extra_whitespace`` (XLSX and CSV tests).
 								EXPECTED_XLSX_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
 								# Expected ``metadata.filetype`` of the first element in the XLSX tests.
 								EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
 								def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
 								    """Partition the XLSX path: expect two Table elements and check the first one's text and metadata."""
 								    tables = partition(filename=filename)
 								    for table in tables:
 								        assert isinstance(table, Table)
 								    assert len(tables) == 2
 								    first_table = tables[0]
 								    assert clean_extra_whitespace(first_table.text) == EXPECTED_XLSX_TEXT
 								    meta = first_table.metadata
 								    assert meta.text_as_html == EXPECTED_XLSX_TABLE
 								    assert meta.page_number == 1
 								    assert meta.filetype == EXPECTED_XLSX_FILETYPE
 								def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
 								    """Partition the XLSX from a binary file handle: expect two Table elements and check the first."""
 								    with open(filename, "rb") as workbook:
 								        tables = partition(file=workbook)
 								    for table in tables:
 								        assert isinstance(table, Table)
 								    assert len(tables) == 2
 								    first_table = tables[0]
 								    assert clean_extra_whitespace(first_table.text) == EXPECTED_XLSX_TEXT
 								    meta = first_table.metadata
 								    assert meta.text_as_html == EXPECTED_XLSX_TABLE
 								    assert meta.page_number == 1
 								    assert meta.filetype == EXPECTED_XLSX_FILETYPE
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
-												feat: add xls support (#632)

Add support for older .XLS files from the partition function in unstructured.partition.auto.

Note, this should also work on the centos7 unstructured image (with the requirements/*txt updates in this PR).
											
										
										
											2023-05-26 01:55:32 -07:00
 								# Expected length of the first element's raw (uncleaned) text in the XLS test.
+								EXPECTED_XLS_TEXT_LEN = 883
 								# Expected first 45 characters of the first element's text after ``clean_extra_whitespace``.
 								EXPECTED_XLS_INITIAL_45_CLEAN_TEXT = "MA What C datatypes are 8 bits? (assume i386)"
 								# Expected ``metadata.text_as_html`` of the first element in the XLS test. The value
 								# is two adjacent string literals (implicitly concatenated), split partway through
 								# the long "Henry Hornbostel" cell.
 								EXPECTED_XLS_TABLE = (
 								    """<table border="1" class="dataframe">
 								  <tbody>
 								    <tr>
 								      <td>MA</td>
 								      <td>What C datatypes are 8 bits? (assume i386)</td>
 								      <td>int</td>
 								      <td></td>
 								      <td>float</td>
 								      <td></td>
 								      <td>double</td>
 								      <td></td>
 								      <td>char</td>
 								    </tr>
 								    <tr>
 								      <td>TF</td>
 								      <td>Bagpipes are awesome.</td>
 								      <td>true</td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								    </tr>
 								    <tr>
 								      <td>ESS</td>
 								      <td>How have the original Henry Hornbostel buildings """
 								    """influenced campus architecture and design in the last 30 years?</td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								    </tr>
 								    <tr>
 								      <td>ORD</td>
 								      <td>Rank the following in their order of operation.</td>
 								      <td>Parentheses</td>
 								      <td>Exponents</td>
 								      <td>Division</td>
 								      <td>Addition</td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								    </tr>
 								    <tr>
 								      <td>FIB</td>
 								      <td>The student activities fee is</td>
 								      <td>95</td>
 								      <td>dollars for students enrolled in</td>
 								      <td>19</td>
 								      <td>units or more,</td>
 								      <td></td>
 								      <td></td>
 								      <td></td>
 								    </tr>
 								    <tr>
 								      <td>MAT</td>
 								      <td>Match the lower-case greek letter with its capital form.</td>
 								      <td>λ</td>
 								      <td>Λ</td>
 								      <td>α</td>
 								      <td>γ</td>
 								      <td>Γ</td>
 								      <td>φ</td>
 								      <td>Φ</td>
 								    </tr>
 								  </tbody>
 								</table>"""
 								)
 								def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"):
 								    """Partition the XLS path: expect three Table elements and check the first one's text and HTML."""
 								    tables = partition(filename=filename)
 								    for table in tables:
 								        assert isinstance(table, Table)
 								    assert len(tables) == 3
 								    first_table = tables[0]
 								    cleaned_prefix = clean_extra_whitespace(first_table.text)[:45]
 								    assert cleaned_prefix == EXPECTED_XLS_INITIAL_45_CLEAN_TEXT
 								    # The length check applies to the raw text, not the whitespace-cleaned form.
 								    assert len(first_table.text) == EXPECTED_XLS_TEXT_LEN
 								    assert first_table.metadata.text_as_html == EXPECTED_XLS_TABLE
-												fix: add more mime types for csv (#620)


											
										
										
											2023-05-19 17:40:26 -04:00
+								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
+								def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
 								    elements = partition(filename=filename)
 								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
 								    assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
 								    assert elements[0].metadata.filetype == "text/csv"
-												fix: add more mime types for csv (#620)


											
										
										
											2023-05-19 17:40:26 -04:00
+								@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
-												feat: add `partition_csv` function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
											
										
										
											2023-05-19 15:57:42 -04:00
+								def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
 								    with open(filename, "rb") as f:
 								        elements = partition(file=f)
 								    assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
 								    assert isinstance(elements[0], Table)
 								    assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
 								    assert elements[0].metadata.filetype == "text/csv"
-												enhancement: handling for empty files in `detect_filetype` and `partition` (#710)

* add empty filetype

* add empty handling to partition

* changelog and version
											
										
										
											2023-06-09 16:07:50 -04:00
 								def test_auto_partition_works_on_empty_filename(filename="example-docs/empty.txt"):
 								    """Partitioning the empty example file by path must return an empty list."""
 								    result = partition(filename=filename)
 								    assert result == []
 								def test_auto_partition_works_on_empty_file(filename="example-docs/empty.txt"):
 								    """Partitioning the empty example file from a binary handle must return an empty list."""
 								    with open(filename, "rb") as empty_file:
 								        result = partition(file=empty_file)
 								        assert result == []