feat: add url kwarg to partition (#470)

* added url option to auto partition * add test for partition from url * version and changelog * update docs * add url to element metadata
2025-12-28 23:58:13 +00:00 · 2023-04-12 14:31:01 -04:00 · 2023-04-12 14:31:01 -04:00 · e2e473dddd
commit e2e473dddd
parent 2110a266c8
5 changed files with 97 additions and 24 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.5.12-dev5
+## 0.5.12

 ### Enhancements

@ -10,6 +10,7 @@

 * Add --partition-by-api parameter to unstructured-ingest
 * Added `partition_rtf` for processing rich text files.
+* `partition` now accepts a `url` kwarg in addition to `file` and `filename`.

 ### Fixes

--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@ -116,6 +116,21 @@ faster processing and `"hi_res"` for
  elements = partition(filename="example-docs/layout-parser-paper-fast.pdf")


+The ``partition`` function also accepts a ``url`` kwarg for remotely hosted documents. If you want
+to force ``partition`` to treat the document as a particular MIME type, use the ``content_type``
+kwarg in conjunction with ``url``. Otherwise, ``partition`` will use the information from
+the ``Content-Type`` header in the HTTP response.
+
+
+.. code:: python
+
+  from unstructured.partition.auto import partition
+
+  url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
+  elements = partition(url=url)
+  elements = partition(url=url, content_type="text/markdown")
+
+
 ``partition_docx``
 ------------------

--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -367,3 +367,10 @@ def test_auto_partition_rtf_from_filename():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf")
    elements = partition(filename=filename)
    assert elements[0] == Title("My First Heading")
+
+
+def test_auto_partition_from_url():
+    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
+    elements = partition(url=url, content_type="text/plain")
+    assert elements[0] == Title("Apache License")
+    assert elements[0].metadata.url == url
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.5.12-dev5"  # pragma: no cover
+__version__ = "0.5.12"  # pragma: no cover
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -1,6 +1,10 @@
-from typing import IO, Callable, Optional
+import io
+from typing import IO, Callable, Optional, Tuple
+
+import requests

 from unstructured.file_utils.filetype import FileType, detect_filetype
+from unstructured.partition.common import exactly_one
 from unstructured.partition.doc import partition_doc
 from unstructured.partition.docx import partition_docx
 from unstructured.partition.email import partition_email
@ -22,6 +26,7 @@ def partition(
    content_type: Optional[str] = None,
    file: Optional[IO] = None,
    file_filename: Optional[str] = None,
+    url: Optional[str] = None,
    include_page_breaks: bool = False,
    strategy: str = "hi_res",
    encoding: str = "utf-8",
@ -42,6 +47,9 @@ def partition(
        A file-like object using "rb" mode --> open(filename, "rb").
    file_filename
        When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
+    url
+        The url for a remote document. Pass in content_type if you want partition to treat
+        the document as a specific content_type.
    include_page_breaks
        If True, the output will include page breaks if the filetype supports it
    strategy
@ -51,37 +59,50 @@ def partition(
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.
    """
-    filetype = detect_filetype(
-        filename=filename,
-        file=file,
-        file_filename=file_filename,
-        content_type=content_type,
-    )
+    exactly_one(file=file, filename=filename, url=url)
+
+    if url is not None:
+        file, filetype = file_and_type_from_url(url=url, content_type=content_type)
+    else:
+        filetype = detect_filetype(
+            filename=filename,
+            file=file,
+            file_filename=file_filename,
+            content_type=content_type,
+        )

    if file is not None:
        file.seek(0)

    if filetype == FileType.DOC:
-        return partition_doc(filename=filename, file=file)
-    if filetype == FileType.DOCX:
-        return partition_docx(filename=filename, file=file)
+        elements = partition_doc(filename=filename, file=file)
+    elif filetype == FileType.DOCX:
+        elements = partition_docx(filename=filename, file=file)
    elif filetype == FileType.EML:
-        return partition_email(filename=filename, file=file, encoding=encoding)
+        elements = partition_email(filename=filename, file=file, encoding=encoding)
    elif filetype == FileType.MSG:
-        return partition_msg(filename=filename, file=file)
+        elements = partition_msg(filename=filename, file=file)
    elif filetype == FileType.HTML:
-        return partition_html(
+        elements = partition_html(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            encoding=encoding,
        )
    elif filetype == FileType.EPUB:
-        return partition_epub(filename=filename, file=file, include_page_breaks=include_page_breaks)
+        elements = partition_epub(
+            filename=filename,
+            file=file,
+            include_page_breaks=include_page_breaks,
+        )
    elif filetype == FileType.MD:
-        return partition_md(filename=filename, file=file, include_page_breaks=include_page_breaks)
+        elements = partition_md(
+            filename=filename,
+            file=file,
+            include_page_breaks=include_page_breaks,
+        )
    elif filetype == FileType.PDF:
-        return partition_pdf(
+        elements = partition_pdf(
            filename=filename,  # type: ignore
            file=file,  # type: ignore
            url=None,
@ -90,27 +111,56 @@ def partition(
            strategy=strategy,
        )
    elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
-        return partition_image(
+        elements = partition_image(
            filename=filename,  # type: ignore
            file=file,  # type: ignore
            url=None,
            include_page_breaks=include_page_breaks,
        )
    elif filetype == FileType.TXT:
-        return partition_text(
+        elements = partition_text(
            filename=filename,
            file=file,
            encoding=encoding,
            paragraph_grouper=paragraph_grouper,
        )
    elif filetype == FileType.RTF:
-        return partition_rtf(filename=filename, file=file, include_page_breaks=include_page_breaks)
+        elements = partition_rtf(
+            filename=filename,
+            file=file,
+            include_page_breaks=include_page_breaks,
+        )
    elif filetype == FileType.PPT:
-        return partition_ppt(filename=filename, file=file, include_page_breaks=include_page_breaks)
+        elements = partition_ppt(
+            filename=filename,
+            file=file,
+            include_page_breaks=include_page_breaks,
+        )
    elif filetype == FileType.PPTX:
-        return partition_pptx(filename=filename, file=file, include_page_breaks=include_page_breaks)
+        elements = partition_pptx(
+            filename=filename,
+            file=file,
+            include_page_breaks=include_page_breaks,
+        )
    elif filetype == FileType.JSON:
-        return partition_json(filename=filename, file=file)
+        elements = partition_json(filename=filename, file=file)
    else:
        msg = "Invalid file" if not filename else f"Invalid file {filename}"
        raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
+
+    for element in elements:
+        element.metadata.url = url
+
+    return elements
+
+
+def file_and_type_from_url(
+    url: str,
+    content_type: Optional[str] = None,
+) -> Tuple[io.BytesIO, Optional[FileType]]:
+    response = requests.get(url)
+    file = io.BytesIO(response.content)
+
+    content_type = content_type or response.headers.get("Content-Type")
+    filetype = detect_filetype(file=file, content_type=content_type)
+    return file, filetype