feat: partition_org for Org Mode documents (#780)

* feat: partition_org for Org Mode documents * update version
2025-12-24 13:44:05 +00:00 · 2023-06-23 20:45:31 +02:00 · 2023-06-23 20:45:31 +02:00 · 752e78e803
commit 752e78e803
parent 5320aa681f
11 changed files with 133 additions and 3 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,13 @@
+## 0.7.9-dev0
+
+### Enhancements
+
+### Features
+
+* Adds `partition_org` for processing Org Mode documents.
+
+### Fixes
+
 ## 0.7.8

 ### Enhancements
--- a/README.md
+++ b/README.md
@ -95,6 +95,7 @@ about the library.
 | HTML Pages (`.html`) | `partition_html` | N/A | No | Encoding; Include Page Breaks |
 | Images (`.png`/`.jpg`) | `partition_image` | `"auto"`, `"hi_res"`, `"ocr_only"` | Yes | Encoding; Include Page Breaks; Infer Table Structure; OCR Languages, Strategy |
 | Markdown (`.md`) | `partition_md` | N/A | Yes | Include Page Breaks |
+| Org Mode (`.org`) | `partition_org` | N/A | Yes | Include Page Breaks |
 | Open Office Documents (`.odt`) | `partition_odt` | N/A | Yes | None |
 | PDFs (`.pdf`) | `partition_pdf` | `"auto"`, `"fast"`, `"hi_res"`, `"ocr_only"` | Yes | Encoding; Include Page Breaks; Infer Table Structure; OCR Languages, Strategy |
 | Plain Text (`.txt`) | `partition_text` | N/A | No | Encoding, Paragraph Grouper |
--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@ -590,6 +590,24 @@ Examples:
      elements = partition_pptx(file=f)


+``partition_org``
+---------------------
+
+The ``partition_org`` function processes Org Mode (``.org``) documents. The function
+first converts the document to HTML using ``pandoc`` and then calls ``partition_html``.
+You'll need `pandoc <https://pandoc.org/installing.html>`_ installed on your system
+to use ``partition_org``.
+
+
+Examples:
+
+.. code:: python
+
+  from unstructured.partition.org import partition_org
+
+  elements = partition_org(filename="example-docs/README.org")
+
+
 ``partition_rst``
 ---------------------

@ -607,7 +625,6 @@ Examples:

  elements = partition_rst(filename="example-docs/README.rst")

-
 ``partition_rtf``
 ---------------------

--- a/example-docs/README.org
+++ b/example-docs/README.org
@ -0,0 +1,27 @@
+* Example Docs
+
+The sample docs directory contains the following files:
+
+-  ~example-10k.html~ - A 10-K SEC filing in HTML format
+-  ~layout-parser-paper.pdf~ - A PDF copy of the layout parser paper
+-  ~factbook.xml~ / ~factbook.xsl~ - Example XML/XSL files that you
+   can use to test stylesheets
+
+These documents can be used to test out the parsers in the library. In
+addition, here are instructions for pulling in some sample docs that are
+too big to store in the repo.
+
+** XBRL 10-K
+
+You can get an example 10-K in inline XBRL format using the following
+~curl~. Note, you need to have the user agent set in the header or the
+SEC site will reject your request.
+
+#+BEGIN_SRC bash
+
+   curl -O \
+     -A '${organization} ${email}' \
+     https://www.sec.gov/Archives/edgar/data/311094/000117184321001344/0001171843-21-001344.txt
+#+END_SRC
+
+You can parse this document using the HTML parser.
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@ -45,6 +45,7 @@ XLSX_MIME_TYPES = [
        ("fake-power-point.pptx", FileType.PPTX),
        ("winter-sports.epub", FileType.EPUB),
        ("spring-weather.html.json", FileType.JSON),
+        ("README.org", FileType.ORG),
        ("README.rst", FileType.RST),
        ("README.md", FileType.MD),
        ("fake.odt", FileType.ODT),
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -784,6 +784,21 @@ def test_auto_partition_works_on_empty_file(filename="example-docs/empty.txt"):
        assert partition(file=f) == []


+def test_auto_partition_org_from_filename(filename="example-docs/README.org"):
+    # Partition the Org example by path and inspect only the leading element.
+    first = partition(filename=filename)[0]
+    assert first == Title("Example Docs")
+    assert first.metadata.filetype == "text/org"
+
+
+def test_auto_partition_org_from_file(filename="example-docs/README.org"):
+    # Partition from an open binary handle, supplying the content type explicitly.
+    with open(filename, "rb") as org_file:
+        first = partition(file=org_file, content_type="text/org")[0]
+    assert first == Title("Example Docs")
+    assert first.metadata.filetype == "text/org"
+
+
 def test_auto_partition_rst_from_filename(filename="example-docs/README.rst"):
    elements = partition(filename=filename)

--- a/test_unstructured/partition/test_org.py
+++ b/test_unstructured/partition/test_org.py
@ -0,0 +1,17 @@
+from unstructured.documents.elements import Title
+from unstructured.partition.org import partition_org
+
+
+def test_partition_org_from_filename(filename="example-docs/README.org"):
+    # Call partition_org directly with a path and inspect only the leading element.
+    first = partition_org(filename=filename)[0]
+    assert first == Title("Example Docs")
+    assert first.metadata.filetype == "text/org"
+
+
+def test_partition_org_from_file(filename="example-docs/README.org"):
+    # Call partition_org directly with an open binary handle instead of a path.
+    with open(filename, "rb") as org_file:
+        first = partition_org(file=org_file)[0]
+    assert first == Title("Example Docs")
+    assert first.metadata.filetype == "text/org"
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.7.8"  # pragma: no cover
+__version__ = "0.7.9-dev0"  # pragma: no cover
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@ -89,6 +89,7 @@ class FileType(Enum):
    MD = 52
    EPUB = 53
    RST = 54
+    ORG = 55

    # Compressed Types
    ZIP = 60
@ -117,6 +118,7 @@ STR_TO_FILETYPE = {
    "text/tsv": FileType.TSV,
    "text/markdown": FileType.MD,
    "text/x-markdown": FileType.MD,
+    "text/org": FileType.ORG,
    "text/x-rst": FileType.RST,
    "application/epub": FileType.EPUB,
    "application/epub+zip": FileType.EPUB,
@ -161,6 +163,7 @@ EXT_TO_FILETYPE = {
    ".htm": FileType.HTML,
    ".html": FileType.HTML,
    ".md": FileType.MD,
+    ".org": FileType.ORG,
    ".rst": FileType.RST,
    ".xlsx": FileType.XLSX,
    ".pptx": FileType.PPTX,
@ -289,7 +292,7 @@ def detect_filetype(
        if file and _check_eml_from_buffer(file=file) is True:
            return FileType.EML

-        if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".tsv", ".json"]:
+        if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".org", ".tsv", ".json"]:
            return EXT_TO_FILETYPE.get(extension)

        # Safety catch
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -23,6 +23,7 @@ from unstructured.partition.json import partition_json
 from unstructured.partition.md import partition_md
 from unstructured.partition.msg import partition_msg
 from unstructured.partition.odt import partition_odt
+from unstructured.partition.org import partition_org
 from unstructured.partition.pdf import partition_pdf
 from unstructured.partition.ppt import partition_ppt
 from unstructured.partition.pptx import partition_pptx
@ -154,6 +155,13 @@ def partition(
            include_page_breaks=include_page_breaks,
            **kwargs,
        )
+    elif filetype == FileType.ORG:
+        elements = partition_org(
+            filename=filename,
+            file=file,
+            include_page_breaks=include_page_breaks,
+            **kwargs,
+        )
    elif filetype == FileType.RST:
        elements = partition_rst(
            filename=filename,
--- a/unstructured/partition/org.py
+++ b/unstructured/partition/org.py
@ -0,0 +1,31 @@
+from typing import IO, List, Optional
+
+from unstructured.documents.elements import Element
+from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
+from unstructured.partition.html import convert_and_partition_html
+
+
+@add_metadata_with_filetype(FileType.ORG)
+def partition_org(
+    filename: Optional[str] = None,
+    file: Optional[IO] = None,
+    include_page_breaks: bool = False,
+    **kwargs,
+) -> List[Element]:
+    """Partitions an org document. The document is first converted to HTML and then
+    partitioned using partition_html.
+
+    Parameters
+    ----------
+    filename
+        A string defining the target filename path.
+    file
+        A file-like object using "rb" mode --> open(filename, "rb").
+    include_page_breaks
+        If True, the output will include page breaks if the filetype supports it
+    **kwargs
+        Extra keyword arguments forwarded by ``partition``; accepted and ignored here.
+    """
+    return convert_and_partition_html(
+        source_format="org", filename=filename, file=file, include_page_breaks=include_page_breaks
+    )