feat: add url kwarg to partition (#470)

* added url option to auto partition

* add test for partition from url

* version and changelog

* update docs

* add url to element metadata
This commit is contained in:
Matt Robinson 2023-04-12 14:31:01 -04:00 committed by GitHub
parent 2110a266c8
commit e2e473dddd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 97 additions and 24 deletions

View File

@ -1,4 +1,4 @@
## 0.5.12-dev5
## 0.5.12
### Enhancements
@ -10,6 +10,7 @@
* Add --partition-by-api parameter to unstructured-ingest
* Added `partition_rtf` for processing rich text files.
* `partition` now accepts a `url` kwarg in addition to `file` and `filename`.
### Fixes

View File

@ -116,6 +116,21 @@ faster processing and `"hi_res"` for
elements = partition(filename="example-docs/layout-parser-paper-fast.pdf")
The ``partition`` function also accepts a ``url`` kwarg for remotely hosted documents. If you want
to force ``partition`` to treat the document as a particular MIME type, use the ``content_type``
kwarg in conjunction with ``url``. Otherwise, ``partition`` will use the information from
the ``Content-Type`` header in the HTTP response.
.. code:: python
from unstructured.partition.auto import partition
url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
elements = partition(url=url)
elements = partition(url=url, content_type="text/markdown")
``partition_docx``
------------------

View File

@ -367,3 +367,10 @@ def test_auto_partition_rtf_from_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf")
elements = partition(filename=filename)
assert elements[0] == Title("My First Heading")
def test_auto_partition_from_url():
    """Partition a remotely hosted document and check the url lands in the metadata."""
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
    partitioned = partition(url=license_url, content_type="text/plain")
    first_element = partitioned[0]
    assert first_element == Title("Apache License")
    assert first_element.metadata.url == license_url

View File

@ -1 +1 @@
__version__ = "0.5.12-dev5" # pragma: no cover
__version__ = "0.5.12" # pragma: no cover

View File

@ -1,6 +1,10 @@
from typing import IO, Callable, Optional
import io
from typing import IO, Callable, Optional, Tuple
import requests
from unstructured.file_utils.filetype import FileType, detect_filetype
from unstructured.partition.common import exactly_one
from unstructured.partition.doc import partition_doc
from unstructured.partition.docx import partition_docx
from unstructured.partition.email import partition_email
@ -22,6 +26,7 @@ def partition(
content_type: Optional[str] = None,
file: Optional[IO] = None,
file_filename: Optional[str] = None,
url: Optional[str] = None,
include_page_breaks: bool = False,
strategy: str = "hi_res",
encoding: str = "utf-8",
@ -42,6 +47,9 @@ def partition(
A file-like object using "rb" mode --> open(filename, "rb").
file_filename
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
url
The url for a remote document. Pass in content_type if you want partition to treat
the document as a specific content_type.
include_page_breaks
If True, the output will include page breaks if the filetype supports it
strategy
@ -51,37 +59,50 @@ def partition(
encoding
The encoding method used to decode the text input. If None, utf-8 will be used.
"""
filetype = detect_filetype(
filename=filename,
file=file,
file_filename=file_filename,
content_type=content_type,
)
exactly_one(file=file, filename=filename, url=url)
if url is not None:
file, filetype = file_and_type_from_url(url=url, content_type=content_type)
else:
filetype = detect_filetype(
filename=filename,
file=file,
file_filename=file_filename,
content_type=content_type,
)
if file is not None:
file.seek(0)
if filetype == FileType.DOC:
return partition_doc(filename=filename, file=file)
if filetype == FileType.DOCX:
return partition_docx(filename=filename, file=file)
elements = partition_doc(filename=filename, file=file)
elif filetype == FileType.DOCX:
elements = partition_docx(filename=filename, file=file)
elif filetype == FileType.EML:
return partition_email(filename=filename, file=file, encoding=encoding)
elements = partition_email(filename=filename, file=file, encoding=encoding)
elif filetype == FileType.MSG:
return partition_msg(filename=filename, file=file)
elements = partition_msg(filename=filename, file=file)
elif filetype == FileType.HTML:
return partition_html(
elements = partition_html(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
encoding=encoding,
)
elif filetype == FileType.EPUB:
return partition_epub(filename=filename, file=file, include_page_breaks=include_page_breaks)
elements = partition_epub(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
)
elif filetype == FileType.MD:
return partition_md(filename=filename, file=file, include_page_breaks=include_page_breaks)
elements = partition_md(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
)
elif filetype == FileType.PDF:
return partition_pdf(
elements = partition_pdf(
filename=filename, # type: ignore
file=file, # type: ignore
url=None,
@ -90,27 +111,56 @@ def partition(
strategy=strategy,
)
elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
return partition_image(
elements = partition_image(
filename=filename, # type: ignore
file=file, # type: ignore
url=None,
include_page_breaks=include_page_breaks,
)
elif filetype == FileType.TXT:
return partition_text(
elements = partition_text(
filename=filename,
file=file,
encoding=encoding,
paragraph_grouper=paragraph_grouper,
)
elif filetype == FileType.RTF:
return partition_rtf(filename=filename, file=file, include_page_breaks=include_page_breaks)
elements = partition_rtf(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
)
elif filetype == FileType.PPT:
return partition_ppt(filename=filename, file=file, include_page_breaks=include_page_breaks)
elements = partition_ppt(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
)
elif filetype == FileType.PPTX:
return partition_pptx(filename=filename, file=file, include_page_breaks=include_page_breaks)
elements = partition_pptx(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
)
elif filetype == FileType.JSON:
return partition_json(filename=filename, file=file)
elements = partition_json(filename=filename, file=file)
else:
msg = "Invalid file" if not filename else f"Invalid file {filename}"
raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
for element in elements:
element.metadata.url = url
return elements
def file_and_type_from_url(
    url: str,
    content_type: Optional[str] = None,
) -> Tuple[io.BytesIO, Optional[FileType]]:
    """Downloads the document at a url and detects its file type.

    Parameters
    ----------
    url
        The url of the remote document. It is fetched with a single HTTP GET request.
    content_type
        The content type to use for file type detection. If not provided, the value of
        the ``Content-Type`` header from the HTTP response is used instead.

    Returns
    -------
    A tuple of an in-memory binary file holding the response body and the file type
    reported by ``detect_filetype`` for that file and content type.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx or 5xx status code.
    """
    # NOTE(review): no timeout is passed, so a server that never responds blocks the
    # caller indefinitely. Consider exposing a timeout if that becomes a problem.
    response = requests.get(url)
    # Fail loudly on error responses. Without this check the body of a 404/500 page
    # would be wrapped in a file object and partitioned as if it were the document.
    response.raise_for_status()

    file = io.BytesIO(response.content)
    # An explicitly passed content type takes precedence over the response header.
    content_type = content_type or response.headers.get("Content-Type")
    filetype = detect_filetype(file=file, content_type=content_type)
    return file, filetype