mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
feat: partition_org for Org Mode documents (#780)
* feat: partition_org for Org Mode documents * update version
This commit is contained in:
parent
5320aa681f
commit
752e78e803
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
||||
## 0.7.9-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
* Adds `partition_org` for processing Org Mode documents.
|
||||
|
||||
### Fixes
|
||||
|
||||
## 0.7.8
|
||||
|
||||
### Enhancements
|
||||
|
@ -95,6 +95,7 @@ about the library.
|
||||
| HTML Pages (`.html`) | `partition_html` | N/A | No | Encoding; Include Page Breaks |
|
||||
| Images (`.png`/`.jpg`) | `partition_image` | `"auto"`, `"hi_res"`, `"ocr_only"` | Yes | Encoding; Include Page Breaks; Infer Table Structure; OCR Languages, Strategy |
|
||||
| Markdown (`.md`) | `partition_md` | N/A | Yes | Include Page Breaks |
|
||||
| Org Mode (`.org`) | `partition_org` | N/A | Yes | Include Page Breaks |
|
||||
| Open Office Documents (`.odt`) | `partition_odt` | N/A | Yes | None |
|
||||
| PDFs (`.pdf`) | `partition_pdf` | `"auto"`, `"fast"`, `"hi_res"`, `"ocr_only"` | Yes | Encoding; Include Page Breaks; Infer Table Structure; OCR Languages, Strategy |
|
||||
| Plain Text (`.txt`) | `partition_text` | N/A | No | Encoding, Paragraph Grouper |
|
||||
|
@ -590,6 +590,24 @@ Examples:
|
||||
elements = partition_pptx(file=f)
|
||||
|
||||
|
||||
``partition_org``
|
||||
---------------------
|
||||
|
||||
The ``partition_org`` function processes Org Mode (``.org``) documents. The function
|
||||
first converts the document to HTML using ``pandoc`` and then calls ``partition_html``.
|
||||
You'll need `pandoc <https://pandoc.org/installing.html>`_ installed on your system
|
||||
to use ``partition_org``.
|
||||
|
||||
|
||||
Examples:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.partition.org import partition_org
|
||||
|
||||
elements = partition_org(filename="example-docs/README.org")
|
||||
|
||||
|
||||
``partition_rst``
|
||||
---------------------
|
||||
|
||||
@ -607,7 +625,6 @@ Examples:
|
||||
|
||||
elements = partition_rst(filename="example-docs/README.rst")
|
||||
|
||||
|
||||
``partition_rtf``
|
||||
---------------------
|
||||
|
||||
|
27
example-docs/README.org
Normal file
27
example-docs/README.org
Normal file
@ -0,0 +1,27 @@
|
||||
* Example Docs
|
||||
|
||||
The sample docs directory contains the following files:
|
||||
|
||||
- ~example-10k.html~ - A 10-K SEC filing in HTML format
|
||||
- ~layout-parser-paper.pdf~ - A PDF copy of the layout parser paper
|
||||
- ~factbook.xml~ / ~factbook.xsl~ - Example XML/XSL files that you
|
||||
can use to test stylesheets
|
||||
|
||||
These documents can be used to test out the parsers in the library. In
|
||||
addition, here are instructions for pulling in some sample docs that are
|
||||
too big to store in the repo.
|
||||
|
||||
** XBRL 10-K
|
||||
|
||||
You can get an example 10-K in inline XBRL format using the following
|
||||
~curl~. Note, you need to have the user agent set in the header or the
|
||||
SEC site will reject your request.
|
||||
|
||||
#+BEGIN_SRC bash
|
||||
|
||||
curl -O \
|
||||
-A '${organization} ${email}' \
|
||||
https://www.sec.gov/Archives/edgar/data/311094/000117184321001344/0001171843-21-001344.txt
|
||||
#+END_SRC
|
||||
|
||||
You can parse this document using the HTML parser.
|
@ -45,6 +45,7 @@ XLSX_MIME_TYPES = [
|
||||
("fake-power-point.pptx", FileType.PPTX),
|
||||
("winter-sports.epub", FileType.EPUB),
|
||||
("spring-weather.html.json", FileType.JSON),
|
||||
("README.org", FileType.ORG),
|
||||
("README.rst", FileType.RST),
|
||||
("README.md", FileType.MD),
|
||||
("fake.odt", FileType.ODT),
|
||||
|
@ -784,6 +784,21 @@ def test_auto_partition_works_on_empty_file(filename="example-docs/empty.txt"):
|
||||
assert partition(file=f) == []
|
||||
|
||||
|
||||
def test_auto_partition_org_from_filename(filename="example-docs/README.org"):
    """Auto-partitioning an Org Mode file given by path should produce the
    document title as the first element, tagged with the org filetype."""
    first_element = partition(filename=filename)[0]

    assert first_element == Title("Example Docs")
    assert first_element.metadata.filetype == "text/org"
|
||||
|
||||
|
||||
def test_auto_partition_org_from_file(filename="example-docs/README.org"):
    """Auto-partitioning an open binary Org Mode file with an explicit
    ``text/org`` content type should produce the document title as the first
    element, tagged with the org filetype."""
    with open(filename, "rb") as org_file:
        first_element = partition(file=org_file, content_type="text/org")[0]

    assert first_element == Title("Example Docs")
    assert first_element.metadata.filetype == "text/org"
|
||||
|
||||
|
||||
def test_auto_partition_rst_from_filename(filename="example-docs/README.rst"):
|
||||
elements = partition(filename=filename)
|
||||
|
||||
|
17
test_unstructured/partition/test_org.py
Normal file
17
test_unstructured/partition/test_org.py
Normal file
@ -0,0 +1,17 @@
|
||||
from unstructured.documents.elements import Title
|
||||
from unstructured.partition.org import partition_org
|
||||
|
||||
|
||||
def test_partition_org_from_filename(filename="example-docs/README.org"):
    """``partition_org`` called with a path should produce the document title
    as the first element, tagged with the org filetype."""
    first_element = partition_org(filename=filename)[0]

    assert first_element == Title("Example Docs")
    assert first_element.metadata.filetype == "text/org"
|
||||
|
||||
|
||||
def test_partition_org_from_file(filename="example-docs/README.org"):
    """``partition_org`` called with an open binary file should produce the
    document title as the first element, tagged with the org filetype."""
    with open(filename, "rb") as org_file:
        first_element = partition_org(file=org_file)[0]

    assert first_element == Title("Example Docs")
    assert first_element.metadata.filetype == "text/org"
|
@ -1 +1 @@
|
||||
__version__ = "0.7.8" # pragma: no cover
|
||||
__version__ = "0.7.9-dev0" # pragma: no cover
|
||||
|
@ -89,6 +89,7 @@ class FileType(Enum):
|
||||
MD = 52
|
||||
EPUB = 53
|
||||
RST = 54
|
||||
ORG = 55
|
||||
|
||||
# Compressed Types
|
||||
ZIP = 60
|
||||
@ -117,6 +118,7 @@ STR_TO_FILETYPE = {
|
||||
"text/tsv": FileType.TSV,
|
||||
"text/markdown": FileType.MD,
|
||||
"text/x-markdown": FileType.MD,
|
||||
"text/org": FileType.ORG,
|
||||
"text/x-rst": FileType.RST,
|
||||
"application/epub": FileType.EPUB,
|
||||
"application/epub+zip": FileType.EPUB,
|
||||
@ -161,6 +163,7 @@ EXT_TO_FILETYPE = {
|
||||
".htm": FileType.HTML,
|
||||
".html": FileType.HTML,
|
||||
".md": FileType.MD,
|
||||
".org": FileType.ORG,
|
||||
".rst": FileType.RST,
|
||||
".xlsx": FileType.XLSX,
|
||||
".pptx": FileType.PPTX,
|
||||
@ -289,7 +292,7 @@ def detect_filetype(
|
||||
if file and _check_eml_from_buffer(file=file) is True:
|
||||
return FileType.EML
|
||||
|
||||
if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".tsv", ".json"]:
|
||||
if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".org", ".tsv", ".json"]:
|
||||
return EXT_TO_FILETYPE.get(extension)
|
||||
|
||||
# Safety catch
|
||||
|
@ -23,6 +23,7 @@ from unstructured.partition.json import partition_json
|
||||
from unstructured.partition.md import partition_md
|
||||
from unstructured.partition.msg import partition_msg
|
||||
from unstructured.partition.odt import partition_odt
|
||||
from unstructured.partition.org import partition_org
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
from unstructured.partition.ppt import partition_ppt
|
||||
from unstructured.partition.pptx import partition_pptx
|
||||
@ -154,6 +155,13 @@ def partition(
|
||||
include_page_breaks=include_page_breaks,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.ORG:
|
||||
elements = partition_org(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.RST:
|
||||
elements = partition_rst(
|
||||
filename=filename,
|
||||
|
31
unstructured/partition/org.py
Normal file
31
unstructured/partition/org.py
Normal file
@ -0,0 +1,31 @@
|
||||
from typing import IO, List, Optional
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.html import convert_and_partition_html
|
||||
|
||||
|
||||
@add_metadata_with_filetype(FileType.ORG)
def partition_org(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    include_page_breaks: bool = False,
    **kwargs,
) -> List[Element]:
    """Partitions an org document. The document is first converted to HTML and then
    partitioned using partition_html.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    include_page_breaks
        If True, the output will include page breaks if the filetype supports it
    **kwargs
        Additional keyword arguments. The auto ``partition`` dispatcher forwards its
        own ``**kwargs`` to this function, so they are accepted here to keep that
        call from raising ``TypeError``. They are not passed on to the HTML
        conversion step.
    """
    # Only the arguments the converter is known to take are forwarded; extra
    # keywords from the dispatcher are deliberately left out of this call.
    return convert_and_partition_html(
        source_format="org",
        filename=filename,
        file=file,
        include_page_breaks=include_page_breaks,
    )
|
Loading…
x
Reference in New Issue
Block a user