feat: partition_org for Org Mode documents (#780)

* feat: partition_org for Org Mode documents

* update version
This commit is contained in:
Martin Mauch 2023-06-23 20:45:31 +02:00 committed by GitHub
parent 5320aa681f
commit 752e78e803
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 133 additions and 3 deletions

View File

@ -1,3 +1,13 @@
## 0.7.9-dev0
### Enhancements
### Features
* Adds `partition_org` for processing Org Mode documents.
### Fixes
## 0.7.8
### Enhancements

View File

@ -95,6 +95,7 @@ about the library.
| HTML Pages (`.html`) | `partition_html` | N/A | No | Encoding; Include Page Breaks |
| Images (`.png`/`.jpg`) | `partition_image` | `"auto"`, `"hi_res"`, `"ocr_only"` | Yes | Encoding; Include Page Breaks; Infer Table Structure; OCR Languages, Strategy |
| Markdown (`.md`) | `partition_md` | N/A | Yes | Include Page Breaks |
| Org Mode (`.org`) | `partition_org` | N/A | Yes | Include Page Breaks |
| Open Office Documents (`.odt`) | `partition_odt` | N/A | Yes | None |
| PDFs (`.pdf`) | `partition_pdf` | `"auto"`, `"fast"`, `"hi_res"`, `"ocr_only"` | Yes | Encoding; Include Page Breaks; Infer Table Structure; OCR Languages, Strategy |
| Plain Text (`.txt`) | `partition_text` | N/A | No | Encoding, Paragraph Grouper |

View File

@ -590,6 +590,24 @@ Examples:
elements = partition_pptx(file=f)
``partition_org``
---------------------
The ``partition_org`` function processes Org Mode (``.org``) documents. The function
first converts the document to HTML using ``pandoc`` and then calls ``partition_html``.
You'll need `pandoc <https://pandoc.org/installing.html>`_ installed on your system
to use ``partition_org``.
Examples:
.. code:: python
from unstructured.partition.org import partition_org
elements = partition_org(filename="example-docs/README.org")
``partition_rst``
---------------------
@ -607,7 +625,6 @@ Examples:
elements = partition_rst(filename="example-docs/README.rst")
``partition_rtf``
---------------------

27
example-docs/README.org Normal file
View File

@ -0,0 +1,27 @@
* Example Docs
The sample docs directory contains the following files:
- ~example-10k.html~ - A 10-K SEC filing in HTML format
- ~layout-parser-paper.pdf~ - A PDF copy of the layout parser paper
- ~factbook.xml~ / ~factbook.xsl~ - Example XML/XSL files that you
can use to test stylesheets
These documents can be used to test out the parsers in the library. In
addition, here are instructions for pulling in some sample docs that are
too big to store in the repo.
** XBRL 10-K
You can get an example 10-K in inline XBRL format using the following
~curl~. Note, you need to have the user agent set in the header or the
SEC site will reject your request.
#+BEGIN_SRC bash
curl -O \
-A '${organization} ${email}' \
https://www.sec.gov/Archives/edgar/data/311094/000117184321001344/0001171843-21-001344.txt
#+END_SRC
You can parse this document using the HTML parser.

View File

@ -45,6 +45,7 @@ XLSX_MIME_TYPES = [
("fake-power-point.pptx", FileType.PPTX),
("winter-sports.epub", FileType.EPUB),
("spring-weather.html.json", FileType.JSON),
("README.org", FileType.ORG),
("README.rst", FileType.RST),
("README.md", FileType.MD),
("fake.odt", FileType.ODT),

View File

@ -784,6 +784,21 @@ def test_auto_partition_works_on_empty_file(filename="example-docs/empty.txt"):
assert partition(file=f) == []
def test_auto_partition_org_from_filename(filename="example-docs/README.org"):
    """Auto-partition an Org Mode file given by path and check its first element."""
    first_element = partition(filename=filename)[0]

    # The sample document opens with a top-level "Example Docs" heading.
    assert first_element == Title("Example Docs")
    assert first_element.metadata.filetype == "text/org"
def test_auto_partition_org_from_file(filename="example-docs/README.org"):
    """Auto-partition an Org Mode file object, passing the content type explicitly."""
    with open(filename, "rb") as org_file:
        parsed = partition(file=org_file, content_type="text/org")

    leading = parsed[0]
    # The sample document opens with a top-level "Example Docs" heading.
    assert leading == Title("Example Docs")
    assert leading.metadata.filetype == "text/org"
def test_auto_partition_rst_from_filename(filename="example-docs/README.rst"):
elements = partition(filename=filename)

View File

@ -0,0 +1,17 @@
from unstructured.documents.elements import Title
from unstructured.partition.org import partition_org
def test_partition_org_from_filename(filename="example-docs/README.org"):
    """Partition an Org Mode file given by path and check its first element."""
    first_element = partition_org(filename=filename)[0]

    # The sample document opens with a top-level "Example Docs" heading.
    assert first_element == Title("Example Docs")
    assert first_element.metadata.filetype == "text/org"
def test_partition_org_from_file(filename="example-docs/README.org"):
    """Partition an Org Mode document supplied as a binary file object."""
    with open(filename, "rb") as org_file:
        parsed = partition_org(file=org_file)

    leading = parsed[0]
    # The sample document opens with a top-level "Example Docs" heading.
    assert leading == Title("Example Docs")
    assert leading.metadata.filetype == "text/org"

View File

@ -1 +1 @@
__version__ = "0.7.8" # pragma: no cover
__version__ = "0.7.9-dev0" # pragma: no cover

View File

@ -89,6 +89,7 @@ class FileType(Enum):
MD = 52
EPUB = 53
RST = 54
ORG = 55
# Compressed Types
ZIP = 60
@ -117,6 +118,7 @@ STR_TO_FILETYPE = {
"text/tsv": FileType.TSV,
"text/markdown": FileType.MD,
"text/x-markdown": FileType.MD,
"text/org": FileType.ORG,
"text/x-rst": FileType.RST,
"application/epub": FileType.EPUB,
"application/epub+zip": FileType.EPUB,
@ -161,6 +163,7 @@ EXT_TO_FILETYPE = {
".htm": FileType.HTML,
".html": FileType.HTML,
".md": FileType.MD,
".org": FileType.ORG,
".rst": FileType.RST,
".xlsx": FileType.XLSX,
".pptx": FileType.PPTX,
@ -289,7 +292,7 @@ def detect_filetype(
if file and _check_eml_from_buffer(file=file) is True:
return FileType.EML
if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".tsv", ".json"]:
if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".org", ".tsv", ".json"]:
return EXT_TO_FILETYPE.get(extension)
# Safety catch

View File

@ -23,6 +23,7 @@ from unstructured.partition.json import partition_json
from unstructured.partition.md import partition_md
from unstructured.partition.msg import partition_msg
from unstructured.partition.odt import partition_odt
from unstructured.partition.org import partition_org
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.ppt import partition_ppt
from unstructured.partition.pptx import partition_pptx
@ -154,6 +155,13 @@ def partition(
include_page_breaks=include_page_breaks,
**kwargs,
)
elif filetype == FileType.ORG:
elements = partition_org(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
**kwargs,
)
elif filetype == FileType.RST:
elements = partition_rst(
filename=filename,

View File

@ -0,0 +1,31 @@
from typing import IO, List, Optional
from unstructured.documents.elements import Element
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.html import convert_and_partition_html
@add_metadata_with_filetype(FileType.ORG)
def partition_org(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    include_page_breaks: bool = False,
    **kwargs,
) -> List[Element]:
    """Partitions an Org Mode document.

    The document is handed to ``convert_and_partition_html`` with
    ``source_format="org"``, which converts it to HTML and partitions the result.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    include_page_breaks
        If True, the output will include page breaks if the filetype supports it
    **kwargs
        Additional keyword arguments. They are accepted so that callers which
        forward their own ``**kwargs`` to this function do not fail with a
        ``TypeError``; they are not passed on to the HTML conversion step.
    """
    # NOTE(review): kwargs are intentionally not forwarded -- the signature of
    # convert_and_partition_html is not visible here; confirm before passing
    # anything through.
    return convert_and_partition_html(
        source_format="org",
        filename=filename,
        file=file,
        include_page_breaks=include_page_breaks,
    )