mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 15:13:35 +00:00
Adding content_type and file_filename to autopartition (#394)
Co-authored-by: cragwolfe <crag@unstructured.io>
This commit is contained in:
parent
8ffd31029e
commit
71e035c34c
@ -1,9 +1,10 @@
|
||||
## 0.5.7-dev3
|
||||
## 0.5.7
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Refactored codebase using `exactly_one`
|
||||
* Added the ability to pass headers when passing a URL to `partition_html()`
|
||||
* Added optional `content_type` and `file_filename` parameters to `partition()` to bypass file detection
|
||||
|
||||
### Features
|
||||
|
||||
|
||||
@ -30,6 +30,7 @@ In cases where ``libmagic`` is not available, filetype detection will fall back
|
||||
As shown in the examples below, the ``partition`` function accepts both filenames and file-like objects as input.
|
||||
``partition`` also has some optional kwargs.
|
||||
For example, if you set ``include_page_breaks=True``, the output will include ``PageBreak`` elements if the filetype supports it.
|
||||
Additionally, you can bypass the filetype detection logic with the optional ``content_type`` argument, which may be specified together with either ``filename`` or the file-like object ``file``.
|
||||
You can find a full listing of optional kwargs in the documentation below.
|
||||
|
||||
.. code:: python
|
||||
@ -38,7 +39,7 @@ You can find a full listing of optional kwargs in the documentation below.
|
||||
|
||||
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
|
||||
elements = partition(filename=filename)
|
||||
elements = partition(filename=filename, content_type="application/pdf")
|
||||
print("\n\n".join([str(el) for el in elements][:10]))
|
||||
|
||||
|
||||
@ -57,7 +58,7 @@ The ``unstructured`` library also includes partitioning bricks targeted at speci
|
||||
The ``partition`` brick uses these document-specific partitioning bricks under the hood.
|
||||
There are a few reasons you may want to use a document-specific partitioning brick instead of ``partition``:
|
||||
|
||||
* If you already know the document type, filetype detection is unnecessary. Using the document-specific brick directly will make your program run faster.
|
||||
* If you already know the document type, filetype detection is unnecessary. Using the document-specific brick directly, or passing in the ``content_type``, will make your program run faster.
|
||||
* Fewer dependencies. You don't need to install ``libmagic`` for filetype detection if you're only using document-specific bricks.
|
||||
* Additional features. The API for partition is the least common denominator for all document types. Certain document-specific bricks include extra features that you may want to take advantage of. For example, ``partition_html`` allows you to pass in a URL so you don't have to store the ``.html`` file locally. See the documentation below to learn about the options available in each partitioning brick.
|
||||
|
||||
|
||||
@ -105,13 +105,27 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element
|
||||
assert elements == expected_docx_elements
|
||||
|
||||
|
||||
def test_auto_partition_doc_with_filename(mock_docx_document, expected_docx_elements, tmpdir):
|
||||
@pytest.mark.parametrize(
|
||||
("pass_file_filename", "content_type"),
|
||||
[(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_doc_with_filename(
|
||||
mock_docx_document,
|
||||
expected_docx_elements,
|
||||
tmpdir,
|
||||
pass_file_filename,
|
||||
content_type,
|
||||
):
|
||||
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
|
||||
mock_docx_document.save(docx_filename)
|
||||
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
|
||||
|
||||
elements = partition(filename=doc_filename)
|
||||
file_filename = doc_filename if pass_file_filename else None
|
||||
elements = partition(
|
||||
filename=doc_filename,
|
||||
file_filename=file_filename,
|
||||
content_type=content_type,
|
||||
)
|
||||
assert elements == expected_docx_elements
|
||||
assert elements[0].metadata.filename == doc_filename
|
||||
|
||||
@ -130,17 +144,27 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements
|
||||
assert elements == expected_docx_elements
|
||||
|
||||
|
||||
def test_auto_partition_html_from_filename():
|
||||
@pytest.mark.parametrize(
|
||||
("pass_file_filename", "content_type"),
|
||||
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_html_from_filename(pass_file_filename, content_type):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
|
||||
elements = partition(filename=filename)
|
||||
file_filename = filename if pass_file_filename else None
|
||||
elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)
|
||||
assert len(elements) > 0
|
||||
assert elements[0].metadata.filename == filename
|
||||
|
||||
|
||||
def test_auto_partition_html_from_file():
|
||||
@pytest.mark.parametrize(
|
||||
("pass_file_filename", "content_type"),
|
||||
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_html_from_file(pass_file_filename, content_type):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
|
||||
file_filename = filename if pass_file_filename else None
|
||||
with open(filename) as f:
|
||||
elements = partition(file=f)
|
||||
elements = partition(file=f, file_filename=file_filename, content_type=content_type)
|
||||
assert len(elements) > 0
|
||||
|
||||
|
||||
@ -177,9 +201,15 @@ def test_auto_partition_text_from_file():
|
||||
assert elements == EXPECTED_TEXT_OUTPUT
|
||||
|
||||
|
||||
def test_auto_partition_pdf_from_filename():
|
||||
@pytest.mark.parametrize(
|
||||
("pass_file_filename", "content_type"),
|
||||
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_pdf_from_filename(pass_file_filename, content_type):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
|
||||
elements = partition(filename=filename)
|
||||
file_filename = filename if pass_file_filename else None
|
||||
|
||||
elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)
|
||||
|
||||
assert isinstance(elements[0], Title)
|
||||
assert elements[0].text.startswith("LayoutParser")
|
||||
@ -207,10 +237,16 @@ def test_auto_partition_pdf_with_fast_strategy():
|
||||
)
|
||||
|
||||
|
||||
def test_auto_partition_pdf_from_file():
|
||||
@pytest.mark.parametrize(
|
||||
("pass_file_filename", "content_type"),
|
||||
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_pdf_from_file(pass_file_filename, content_type):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
|
||||
file_filename = filename if pass_file_filename else None
|
||||
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition(file=f)
|
||||
elements = partition(file=f, file_filename=file_filename, content_type=content_type)
|
||||
|
||||
assert isinstance(elements[0], Title)
|
||||
assert elements[0].text.startswith("LayoutParser")
|
||||
@ -230,16 +266,26 @@ def test_partition_pdf_doesnt_raise_warning():
|
||||
partition(filename=filename)
|
||||
|
||||
|
||||
def test_auto_partition_jpg():
|
||||
@pytest.mark.parametrize(
|
||||
("pass_file_filename", "content_type"),
|
||||
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_jpg(pass_file_filename, content_type):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.jpg")
|
||||
elements = partition(filename=filename)
|
||||
file_filename = filename if pass_file_filename else None
|
||||
elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)
|
||||
assert len(elements) > 0
|
||||
|
||||
|
||||
def test_auto_partition_jpg_from_file():
|
||||
@pytest.mark.parametrize(
|
||||
("pass_file_filename", "content_type"),
|
||||
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_jpg_from_file(pass_file_filename, content_type):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.jpg")
|
||||
file_filename = filename if pass_file_filename else None
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition(file=f)
|
||||
elements = partition(file=f, file_filename=file_filename, content_type=content_type)
|
||||
assert len(elements) > 0
|
||||
|
||||
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.5.7-dev3" # pragma: no cover
|
||||
__version__ = "0.5.7" # pragma: no cover
|
||||
|
||||
@ -111,6 +111,24 @@ class FileType(Enum):
|
||||
return self.name < other.name
|
||||
|
||||
|
||||
STR_TO_FILETYPE = {
|
||||
"application/pdf": FileType.PDF,
|
||||
"application/msword": FileType.DOC,
|
||||
"image/jpeg": FileType.JPG,
|
||||
"image/png": FileType.PNG,
|
||||
"text/markdown": FileType.MD,
|
||||
"text/x-markdown": FileType.MD,
|
||||
"application/epub": FileType.EPUB,
|
||||
"application/epub+zip": FileType.EPUB,
|
||||
"text/html": FileType.HTML,
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
|
||||
"application/vnd.ms-excel": FileType.XLS,
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
|
||||
"application/vnd.ms-powerpoint": FileType.PPT,
|
||||
"application/xml": FileType.XML,
|
||||
}
|
||||
|
||||
|
||||
EXT_TO_FILETYPE = {
|
||||
".pdf": FileType.PDF,
|
||||
".docx": FileType.DOCX,
|
||||
@ -138,18 +156,26 @@ EXT_TO_FILETYPE = {
|
||||
|
||||
def detect_filetype(
|
||||
filename: Optional[str] = None,
|
||||
content_type: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
file_filename: Optional[str] = None,
|
||||
) -> Optional[FileType]:
|
||||
"""Use libmagic to determine a file's type. Helps determine which partition brick
|
||||
to use for a given file. A return value of None indicates a non-supported file type."""
|
||||
exactly_one(filename=filename, file=file)
|
||||
|
||||
if filename:
|
||||
_, extension = os.path.splitext(filename)
|
||||
if content_type:
|
||||
filetype = STR_TO_FILETYPE.get(content_type)
|
||||
if filetype:
|
||||
return filetype
|
||||
|
||||
if filename or file_filename:
|
||||
_, extension = os.path.splitext(filename or file_filename or "")
|
||||
extension = extension.lower()
|
||||
if LIBMAGIC_AVAILABLE:
|
||||
mime_type = magic.from_file(filename, mime=True)
|
||||
mime_type = magic.from_file(filename or file_filename, mime=True) # type: ignore
|
||||
else:
|
||||
# might not need this
|
||||
return EXT_TO_FILETYPE.get(extension.lower(), FileType.UNK)
|
||||
elif file is not None:
|
||||
extension = None
|
||||
@ -164,6 +190,8 @@ def detect_filetype(
|
||||
"Filetype detection on file-like objects requires libmagic. "
|
||||
"Please install libmagic and try again.",
|
||||
)
|
||||
else:
|
||||
raise ValueError("No filename, file, nor file_filename were specified.")
|
||||
|
||||
if mime_type == "application/pdf":
|
||||
return FileType.PDF
|
||||
|
||||
@ -17,7 +17,9 @@ from unstructured.partition.text import partition_text
|
||||
|
||||
def partition(
|
||||
filename: Optional[str] = None,
|
||||
content_type: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
file_filename: Optional[str] = None,
|
||||
include_page_breaks: bool = False,
|
||||
strategy: str = "hi_res",
|
||||
encoding: str = "utf-8",
|
||||
@ -31,8 +33,12 @@ def partition(
|
||||
----------
|
||||
filename
|
||||
A string defining the target filename path.
|
||||
content_type
|
||||
A string defining the MIME type of the file content, e.g. "application/pdf".
|
||||
file
|
||||
A file-like object using "rb" mode --> open(filename, "rb").
|
||||
file_filename
|
||||
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
|
||||
include_page_breaks
|
||||
If True, the output will include page breaks if the filetype supports it
|
||||
strategy
|
||||
@ -42,7 +48,12 @@ def partition(
|
||||
encoding
|
||||
The encoding method used to decode the text input. If None, utf-8 will be used.
|
||||
"""
|
||||
filetype = detect_filetype(filename=filename, file=file)
|
||||
filetype = detect_filetype(
|
||||
filename=filename,
|
||||
file=file,
|
||||
file_filename=file_filename,
|
||||
content_type=content_type,
|
||||
)
|
||||
|
||||
if file is not None:
|
||||
file.seek(0)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user