mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-28 15:45:21 +00:00
feat: add url kwarg to partition (#470)
* added url option to auto partition * add test for partition from url * version and changelog * update docs * add url to element metadata
This commit is contained in:
parent
2110a266c8
commit
e2e473dddd
@ -1,4 +1,4 @@
|
||||
## 0.5.12-dev5
|
||||
## 0.5.12
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -10,6 +10,7 @@
|
||||
|
||||
* Add --partition-by-api parameter to unstructured-ingest
|
||||
* Added `partition_rtf` for processing rich text files.
|
||||
* `partition` now accepts a `url` kwarg in addition to `file` and `filename`.
|
||||
|
||||
### Fixes
|
||||
|
||||
|
||||
@ -116,6 +116,21 @@ faster processing and `"hi_res"` for
|
||||
elements = partition(filename="example-docs/layout-parser-paper-fast.pdf")
|
||||
|
||||
|
||||
The ``partition`` function also accepts a ``url`` kwarg for remotely hosted documents. If you want
|
||||
to force ``partition`` to treat the document as a particular MIME type, use the ``content_type``
|
||||
kwarg in conjunction with ``url``. Otherwise, ``partition`` will use the information from
|
||||
the ``Content-Type`` header in the HTTP response.
|
||||
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.partition.auto import partition
|
||||
|
||||
url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
|
||||
elements = partition(url=url)
|
||||
elements = partition(url=url, content_type="text/markdown")
|
||||
|
||||
|
||||
``partition_docx``
|
||||
------------------
|
||||
|
||||
|
||||
@ -367,3 +367,10 @@ def test_auto_partition_rtf_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf")
|
||||
elements = partition(filename=filename)
|
||||
assert elements[0] == Title("My First Heading")
|
||||
|
||||
|
||||
def test_auto_partition_from_url():
    """Partition a remotely hosted document by URL and check the first element.

    The document is forced to be treated as plain text through ``content_type``;
    the first element must be the expected title and must carry the source URL
    in its metadata.
    """
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
    parsed = partition(url=license_url, content_type="text/plain")
    first_element = parsed[0]
    assert first_element == Title("Apache License")
    assert first_element.metadata.url == license_url
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.5.12-dev5" # pragma: no cover
|
||||
__version__ = "0.5.12" # pragma: no cover
|
||||
|
||||
@ -1,6 +1,10 @@
|
||||
from typing import IO, Callable, Optional
|
||||
import io
|
||||
from typing import IO, Callable, Optional, Tuple
|
||||
|
||||
import requests
|
||||
|
||||
from unstructured.file_utils.filetype import FileType, detect_filetype
|
||||
from unstructured.partition.common import exactly_one
|
||||
from unstructured.partition.doc import partition_doc
|
||||
from unstructured.partition.docx import partition_docx
|
||||
from unstructured.partition.email import partition_email
|
||||
@ -22,6 +26,7 @@ def partition(
|
||||
content_type: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
file_filename: Optional[str] = None,
|
||||
url: Optional[str] = None,
|
||||
include_page_breaks: bool = False,
|
||||
strategy: str = "hi_res",
|
||||
encoding: str = "utf-8",
|
||||
@ -42,6 +47,9 @@ def partition(
|
||||
A file-like object using "rb" mode --> open(filename, "rb").
|
||||
file_filename
|
||||
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
|
||||
url
|
||||
The url for a remote document. Pass in content_type if you want partition to treat
|
||||
the document as a specific content_type.
|
||||
include_page_breaks
|
||||
If True, the output will include page breaks if the filetype supports it
|
||||
strategy
|
||||
@ -51,37 +59,50 @@ def partition(
|
||||
encoding
|
||||
The encoding method used to decode the text input. If None, utf-8 will be used.
|
||||
"""
|
||||
filetype = detect_filetype(
|
||||
filename=filename,
|
||||
file=file,
|
||||
file_filename=file_filename,
|
||||
content_type=content_type,
|
||||
)
|
||||
exactly_one(file=file, filename=filename, url=url)
|
||||
|
||||
if url is not None:
|
||||
file, filetype = file_and_type_from_url(url=url, content_type=content_type)
|
||||
else:
|
||||
filetype = detect_filetype(
|
||||
filename=filename,
|
||||
file=file,
|
||||
file_filename=file_filename,
|
||||
content_type=content_type,
|
||||
)
|
||||
|
||||
if file is not None:
|
||||
file.seek(0)
|
||||
|
||||
if filetype == FileType.DOC:
|
||||
return partition_doc(filename=filename, file=file)
|
||||
if filetype == FileType.DOCX:
|
||||
return partition_docx(filename=filename, file=file)
|
||||
elements = partition_doc(filename=filename, file=file)
|
||||
elif filetype == FileType.DOCX:
|
||||
elements = partition_docx(filename=filename, file=file)
|
||||
elif filetype == FileType.EML:
|
||||
return partition_email(filename=filename, file=file, encoding=encoding)
|
||||
elements = partition_email(filename=filename, file=file, encoding=encoding)
|
||||
elif filetype == FileType.MSG:
|
||||
return partition_msg(filename=filename, file=file)
|
||||
elements = partition_msg(filename=filename, file=file)
|
||||
elif filetype == FileType.HTML:
|
||||
return partition_html(
|
||||
elements = partition_html(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
encoding=encoding,
|
||||
)
|
||||
elif filetype == FileType.EPUB:
|
||||
return partition_epub(filename=filename, file=file, include_page_breaks=include_page_breaks)
|
||||
elements = partition_epub(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
)
|
||||
elif filetype == FileType.MD:
|
||||
return partition_md(filename=filename, file=file, include_page_breaks=include_page_breaks)
|
||||
elements = partition_md(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
)
|
||||
elif filetype == FileType.PDF:
|
||||
return partition_pdf(
|
||||
elements = partition_pdf(
|
||||
filename=filename, # type: ignore
|
||||
file=file, # type: ignore
|
||||
url=None,
|
||||
@ -90,27 +111,56 @@ def partition(
|
||||
strategy=strategy,
|
||||
)
|
||||
elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
|
||||
return partition_image(
|
||||
elements = partition_image(
|
||||
filename=filename, # type: ignore
|
||||
file=file, # type: ignore
|
||||
url=None,
|
||||
include_page_breaks=include_page_breaks,
|
||||
)
|
||||
elif filetype == FileType.TXT:
|
||||
return partition_text(
|
||||
elements = partition_text(
|
||||
filename=filename,
|
||||
file=file,
|
||||
encoding=encoding,
|
||||
paragraph_grouper=paragraph_grouper,
|
||||
)
|
||||
elif filetype == FileType.RTF:
|
||||
return partition_rtf(filename=filename, file=file, include_page_breaks=include_page_breaks)
|
||||
elements = partition_rtf(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
)
|
||||
elif filetype == FileType.PPT:
|
||||
return partition_ppt(filename=filename, file=file, include_page_breaks=include_page_breaks)
|
||||
elements = partition_ppt(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
)
|
||||
elif filetype == FileType.PPTX:
|
||||
return partition_pptx(filename=filename, file=file, include_page_breaks=include_page_breaks)
|
||||
elements = partition_pptx(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
)
|
||||
elif filetype == FileType.JSON:
|
||||
return partition_json(filename=filename, file=file)
|
||||
elements = partition_json(filename=filename, file=file)
|
||||
else:
|
||||
msg = "Invalid file" if not filename else f"Invalid file {filename}"
|
||||
raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
|
||||
|
||||
for element in elements:
|
||||
element.metadata.url = url
|
||||
|
||||
return elements
|
||||
|
||||
|
||||
def file_and_type_from_url(
    url: str,
    content_type: Optional[str] = None,
    request_timeout: Optional[float] = None,
) -> Tuple[io.BytesIO, Optional[FileType]]:
    """Downloads the document at a URL and determines its file type.

    Parameters
    ----------
    url
        The url for a remote document. It is fetched with an HTTP GET request.
    content_type
        The content type to treat the document as. If not provided, the value of the
        Content-Type header in the HTTP response is used instead.
    request_timeout
        Number of seconds to wait for the server before giving up. If None (the default),
        no timeout is applied, which matches the behavior before this parameter existed.

    Returns
    -------
    A tuple of (file, filetype): an in-memory binary buffer holding the response body and
    the result of detect_filetype for that buffer, which may be None.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx or 5xx status code.
    """
    response = requests.get(url, timeout=request_timeout)
    # Fail loudly on an error status instead of wrapping the server's error page
    # and partitioning it as though it were the requested document.
    response.raise_for_status()
    file = io.BytesIO(response.content)

    # An explicit content_type from the caller wins over the response header.
    content_type = content_type or response.headers.get("Content-Type")
    filetype = detect_filetype(file=file, content_type=content_type)
    return file, filetype
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user