feat: partition_org for Org Mode documents (#780)

* feat: partition_org for Org Mode documents * update version
2025-06-27 02:30:08 +00:00 · 2023-06-23 20:45:31 +02:00 · 2023-06-23 20:45:31 +02:00 · 752e78e803
commit 752e78e803
parent 5320aa681f
11 changed files with 133 additions and 3 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,13 @@
 ## 0.7.9-dev0
 ### Enhancements
 ### Features
 * Adds `partition_org` for processing Org Mode documents.
 ### Fixes
 ## 0.7.8
 ### Enhancements
--- a/README.md
+++ b/README.md
@ -95,6 +95,7 @@ about the library.
 | HTML Pages (`.html`) | `partition_html` | N/A | No | Encoding; Include Page Breaks |
 | Images (`.png`/`.jpg`) | `partition_image` | `"auto"`, `"hi_res"`, `"ocr_only"` | Yes | Encoding; Include Page Breaks; Infer Table Structure; OCR Languages, Strategy |
 | Markdown (`.md`) | `partition_md` | N/A | Yes | Include Page Breaks |
 | Org Mode (`.org`) | `partition_org` | N/A | Yes | Include Page Breaks |
 | Open Office Documents (`.odt`) | `partition_odt` | N/A | Yes | None |
 | PDFs (`.pdf`) | `partition_pdf` | `"auto"`, `"fast"`, `"hi_res"`, `"ocr_only"` | Yes | Encoding; Include Page Breaks; Infer Table Structure; OCR Languages, Strategy |
 | Plain Text (`.txt`) | `partition_text` | N/A | No | Encoding, Paragraph Grouper |
--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@ -590,6 +590,24 @@ Examples:
      elements = partition_pptx(file=f)
 ``partition_org``
 ---------------------
 The ``partition_org`` function processes Org Mode (``.org``) documents. The function
 first converts the document to HTML using ``pandoc`` and then calls ``partition_html``.
 You'll need `pandoc <https://pandoc.org/installing.html>`_ installed on your system
 to use ``partition_org``.
 Examples:
 .. code:: python
  from unstructured.partition.org import partition_org
  elements = partition_org(filename="example-docs/README.org")
 ``partition_rst``
 ---------------------
@ -607,7 +625,6 @@ Examples:
  elements = partition_rst(filename="example-docs/README.rst")
 ``partition_rtf``
 ---------------------
--- a/example-docs/README.org
+++ b/example-docs/README.org
@ -0,0 +1,27 @@
 * Example Docs
 The sample docs directory contains the following files:
 -  ~example-10k.html~ - A 10-K SEC filing in HTML format
 -  ~layout-parser-paper.pdf~ - A PDF copy of the layout parser paper
 -  ~factbook.xml~ / ~factbook.xsl~ - Example XML/XSL files that you
   can use to test stylesheets
 These documents can be used to test out the parsers in the library. In
 addition, here are instructions for pulling in some sample docs that are
 too big to store in the repo.
 ** XBRL 10-K
 You can get an example 10-K in inline XBRL format using the following
 ~curl~. Note, you need to have the user agent set in the header or the
 SEC site will reject your request.
 #+BEGIN_SRC bash
   curl -O \
     -A '${organization} ${email}' \
     https://www.sec.gov/Archives/edgar/data/311094/000117184321001344/0001171843-21-001344.txt
 #+END_SRC
 You can parse this document using the HTML parser.
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@ -45,6 +45,7 @@ XLSX_MIME_TYPES = [
        ("fake-power-point.pptx", FileType.PPTX),
        ("winter-sports.epub", FileType.EPUB),
        ("spring-weather.html.json", FileType.JSON),
        ("README.org", FileType.ORG),
        ("README.rst", FileType.RST),
        ("README.md", FileType.MD),
        ("fake.odt", FileType.ODT),
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -784,6 +784,21 @@ def test_auto_partition_works_on_empty_file(filename="example-docs/empty.txt"):
        assert partition(file=f) == []
 def test_auto_partition_org_from_filename(filename="example-docs/README.org"):
    """Auto-partition an Org file given by path and check the first element."""
    first_element = partition(filename=filename)[0]
    # The sample document opens with the top-level heading "Example Docs".
    assert first_element == Title("Example Docs")
    assert first_element.metadata.filetype == "text/org"
 def test_auto_partition_org_from_file(filename="example-docs/README.org"):
    """Auto-partition an Org file passed as a binary handle with an explicit content type."""
    with open(filename, "rb") as org_file:
        first_element = partition(file=org_file, content_type="text/org")[0]
    # The sample document opens with the top-level heading "Example Docs".
    assert first_element == Title("Example Docs")
    assert first_element.metadata.filetype == "text/org"
 def test_auto_partition_rst_from_filename(filename="example-docs/README.rst"):
    elements = partition(filename=filename)
--- a/test_unstructured/partition/test_org.py
+++ b/test_unstructured/partition/test_org.py
@ -0,0 +1,17 @@
 from unstructured.documents.elements import Title
 from unstructured.partition.org import partition_org
 def test_partition_org_from_filename(filename="example-docs/README.org"):
    """Partition an Org file given by path and check the first element."""
    first_element = partition_org(filename=filename)[0]
    # The sample document opens with the top-level heading "Example Docs".
    assert first_element == Title("Example Docs")
    assert first_element.metadata.filetype == "text/org"
 def test_partition_org_from_file(filename="example-docs/README.org"):
    """Partition an Org file passed as a binary handle and check the first element."""
    with open(filename, "rb") as org_file:
        first_element = partition_org(file=org_file)[0]
    # The sample document opens with the top-level heading "Example Docs".
    assert first_element == Title("Example Docs")
    assert first_element.metadata.filetype == "text/org"
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.7.8"  # pragma: no cover
+__version__ = "0.7.9-dev0"  # pragma: no cover
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@ -89,6 +89,7 @@ class FileType(Enum):
    MD = 52
    EPUB = 53
    RST = 54
    ORG = 55
    # Compressed Types
    ZIP = 60
@ -117,6 +118,7 @@ STR_TO_FILETYPE = {
    "text/tsv": FileType.TSV,
    "text/markdown": FileType.MD,
    "text/x-markdown": FileType.MD,
    "text/org": FileType.ORG,
    "text/x-rst": FileType.RST,
    "application/epub": FileType.EPUB,
    "application/epub+zip": FileType.EPUB,
@ -161,6 +163,7 @@ EXT_TO_FILETYPE = {
    ".htm": FileType.HTML,
    ".html": FileType.HTML,
    ".md": FileType.MD,
    ".org": FileType.ORG,
    ".rst": FileType.RST,
    ".xlsx": FileType.XLSX,
    ".pptx": FileType.PPTX,
@ -289,7 +292,7 @@ def detect_filetype(
        if file and _check_eml_from_buffer(file=file) is True:
            return FileType.EML
-        if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".tsv", ".json"]:
+        if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".org", ".tsv", ".json"]:
            return EXT_TO_FILETYPE.get(extension)
        # Safety catch
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -23,6 +23,7 @@ from unstructured.partition.json import partition_json
 from unstructured.partition.md import partition_md
 from unstructured.partition.msg import partition_msg
 from unstructured.partition.odt import partition_odt
 from unstructured.partition.org import partition_org
 from unstructured.partition.pdf import partition_pdf
 from unstructured.partition.ppt import partition_ppt
 from unstructured.partition.pptx import partition_pptx
@ -154,6 +155,13 @@ def partition(
            include_page_breaks=include_page_breaks,
            **kwargs,
        )
    elif filetype == FileType.ORG:
        elements = partition_org(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            **kwargs,
        )
    elif filetype == FileType.RST:
        elements = partition_rst(
            filename=filename,
--- a/unstructured/partition/org.py
+++ b/unstructured/partition/org.py
@ -0,0 +1,31 @@
 from typing import IO, List, Optional
 from unstructured.documents.elements import Element
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.html import convert_and_partition_html
@add_metadata_with_filetype(FileType.ORG)
def partition_org(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    include_page_breaks: bool = False,
    **kwargs,
) -> List[Element]:
    """Partitions an org document. The document is first converted to HTML and then
    partitioned using partition_html.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    include_page_breaks
        If True, the output will include page breaks if the filetype supports it
    **kwargs
        Additional keyword arguments are accepted and ignored. Callers that forward
        their own ``**kwargs`` (as the auto ``partition`` dispatcher does) would
        otherwise fail with a ``TypeError`` on any option this function does not know.

    Returns
    -------
    List[Element]
        The elements returned by ``convert_and_partition_html`` for the document.
    """
    # kwargs is deliberately not forwarded: only the arguments below are known to be
    # understood by convert_and_partition_html.
    return convert_and_partition_html(
        source_format="org",
        filename=filename,
        file=file,
        include_page_breaks=include_page_breaks,
    )
`@ -1 +1 @@`
	`__version__ = "0.7.8" # pragma: no cover`	`__version__ = "0.7.9-dev0" # pragma: no cover`