mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
feat: partition_org for Org Mode documents (#780)
* feat: partition_org for Org Mode documents * update version
This commit is contained in:
parent
5320aa681f
commit
752e78e803
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
|||||||
|
## 0.7.9-dev0
|
||||||
|
|
||||||
|
### Enhancements
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
* Adds `partition_org` for processing Org Mode documents.
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
|
||||||
## 0.7.8
|
## 0.7.8
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
@ -95,6 +95,7 @@ about the library.
|
|||||||
| HTML Pages (`.html`) | `partition_html` | N/A | No | Encoding; Include Page Breaks |
|
| HTML Pages (`.html`) | `partition_html` | N/A | No | Encoding; Include Page Breaks |
|
||||||
| Images (`.png`/`.jpg`) | `partition_image` | `"auto"`, `"hi_res"`, `"ocr_only"` | Yes | Encoding; Include Page Breaks; Infer Table Structure; OCR Languages, Strategy |
|
| Images (`.png`/`.jpg`) | `partition_image` | `"auto"`, `"hi_res"`, `"ocr_only"` | Yes | Encoding; Include Page Breaks; Infer Table Structure; OCR Languages, Strategy |
|
||||||
| Markdown (`.md`) | `partition_md` | N/A | Yes | Include Page Breaks |
|
| Markdown (`.md`) | `partition_md` | N/A | Yes | Include Page Breaks |
|
||||||
|
| Org Mode (`.org`) | `partition_org` | N/A | Yes | Include Page Breaks |
|
||||||
| Open Office Documents (`.odt`) | `partition_odt` | N/A | Yes | None |
|
| Open Office Documents (`.odt`) | `partition_odt` | N/A | Yes | None |
|
||||||
| PDFs (`.pdf`) | `partition_pdf` | `"auto"`, `"fast"`, `"hi_res"`, `"ocr_only"` | Yes | Encoding; Include Page Breaks; Infer Table Structure; OCR Languages, Strategy |
|
| PDFs (`.pdf`) | `partition_pdf` | `"auto"`, `"fast"`, `"hi_res"`, `"ocr_only"` | Yes | Encoding; Include Page Breaks; Infer Table Structure; OCR Languages, Strategy |
|
||||||
| Plain Text (`.txt`) | `partition_text` | N/A | No | Encoding, Paragraph Grouper |
|
| Plain Text (`.txt`) | `partition_text` | N/A | No | Encoding, Paragraph Grouper |
|
||||||
|
@ -590,6 +590,24 @@ Examples:
|
|||||||
elements = partition_pptx(file=f)
|
elements = partition_pptx(file=f)
|
||||||
|
|
||||||
|
|
||||||
|
``partition_org``
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
The ``partition_org`` function processes Org Mode (``.org``) documents. The function
|
||||||
|
first converts the document to HTML using ``pandoc`` and then calls ``partition_html``.
|
||||||
|
You'll need `pandoc <https://pandoc.org/installing.html>`_ installed on your system
|
||||||
|
to use ``partition_org``.
|
||||||
|
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.partition.org import partition_org
|
||||||
|
|
||||||
|
elements = partition_org(filename="example-docs/README.org")
|
||||||
|
|
||||||
|
|
||||||
``partition_rst``
|
``partition_rst``
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
@ -607,7 +625,6 @@ Examples:
|
|||||||
|
|
||||||
elements = partition_rst(filename="example-docs/README.rst")
|
elements = partition_rst(filename="example-docs/README.rst")
|
||||||
|
|
||||||
|
|
||||||
``partition_rtf``
|
``partition_rtf``
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
27
example-docs/README.org
Normal file
27
example-docs/README.org
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
* Example Docs
|
||||||
|
|
||||||
|
The sample docs directory contains the following files:
|
||||||
|
|
||||||
|
- ~example-10k.html~ - A 10-K SEC filing in HTML format
|
||||||
|
- ~layout-parser-paper.pdf~ - A PDF copy of the layout parser paper
|
||||||
|
- ~factbook.xml~ / ~factbook.xsl~ - Example XML/XSL files that you
|
||||||
|
can use to test stylesheets
|
||||||
|
|
||||||
|
These documents can be used to test out the parsers in the library. In
|
||||||
|
addition, here are instructions for pulling in some sample docs that are
|
||||||
|
too big to store in the repo.
|
||||||
|
|
||||||
|
** XBRL 10-K
|
||||||
|
|
||||||
|
You can get an example 10-K in inline XBRL format using the following
|
||||||
|
~curl~. Note, you need to have the user agent set in the header or the
|
||||||
|
SEC site will reject your request.
|
||||||
|
|
||||||
|
#+BEGIN_SRC bash
|
||||||
|
|
||||||
|
curl -O \
|
||||||
|
-A '${organization} ${email}'
|
||||||
|
https://www.sec.gov/Archives/edgar/data/311094/000117184321001344/0001171843-21-001344.txt
|
||||||
|
#+END_SRC
|
||||||
|
|
||||||
|
You can parse this document using the HTML parser.
|
@ -45,6 +45,7 @@ XLSX_MIME_TYPES = [
|
|||||||
("fake-power-point.pptx", FileType.PPTX),
|
("fake-power-point.pptx", FileType.PPTX),
|
||||||
("winter-sports.epub", FileType.EPUB),
|
("winter-sports.epub", FileType.EPUB),
|
||||||
("spring-weather.html.json", FileType.JSON),
|
("spring-weather.html.json", FileType.JSON),
|
||||||
|
("README.org", FileType.ORG),
|
||||||
("README.rst", FileType.RST),
|
("README.rst", FileType.RST),
|
||||||
("README.md", FileType.MD),
|
("README.md", FileType.MD),
|
||||||
("fake.odt", FileType.ODT),
|
("fake.odt", FileType.ODT),
|
||||||
|
@ -784,6 +784,21 @@ def test_auto_partition_works_on_empty_file(filename="example-docs/empty.txt"):
|
|||||||
assert partition(file=f) == []
|
assert partition(file=f) == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_partition_org_from_filename(filename="example-docs/README.org"):
|
||||||
|
elements = partition(filename=filename)
|
||||||
|
|
||||||
|
assert elements[0] == Title("Example Docs")
|
||||||
|
assert elements[0].metadata.filetype == "text/org"
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_partition_org_from_file(filename="example-docs/README.org"):
|
||||||
|
with open(filename, "rb") as f:
|
||||||
|
elements = partition(file=f, content_type="text/org")
|
||||||
|
|
||||||
|
assert elements[0] == Title("Example Docs")
|
||||||
|
assert elements[0].metadata.filetype == "text/org"
|
||||||
|
|
||||||
|
|
||||||
def test_auto_partition_rst_from_filename(filename="example-docs/README.rst"):
|
def test_auto_partition_rst_from_filename(filename="example-docs/README.rst"):
|
||||||
elements = partition(filename=filename)
|
elements = partition(filename=filename)
|
||||||
|
|
||||||
|
17
test_unstructured/partition/test_org.py
Normal file
17
test_unstructured/partition/test_org.py
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
from unstructured.documents.elements import Title
|
||||||
|
from unstructured.partition.org import partition_org
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_org_from_filename(filename="example-docs/README.org"):
|
||||||
|
elements = partition_org(filename=filename)
|
||||||
|
|
||||||
|
assert elements[0] == Title("Example Docs")
|
||||||
|
assert elements[0].metadata.filetype == "text/org"
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_org_from_file(filename="example-docs/README.org"):
|
||||||
|
with open(filename, "rb") as f:
|
||||||
|
elements = partition_org(file=f)
|
||||||
|
|
||||||
|
assert elements[0] == Title("Example Docs")
|
||||||
|
assert elements[0].metadata.filetype == "text/org"
|
@ -1 +1 @@
|
|||||||
__version__ = "0.7.8" # pragma: no cover
|
__version__ = "0.7.9-dev0" # pragma: no cover
|
||||||
|
@ -89,6 +89,7 @@ class FileType(Enum):
|
|||||||
MD = 52
|
MD = 52
|
||||||
EPUB = 53
|
EPUB = 53
|
||||||
RST = 54
|
RST = 54
|
||||||
|
ORG = 55
|
||||||
|
|
||||||
# Compressed Types
|
# Compressed Types
|
||||||
ZIP = 60
|
ZIP = 60
|
||||||
@ -117,6 +118,7 @@ STR_TO_FILETYPE = {
|
|||||||
"text/tsv": FileType.TSV,
|
"text/tsv": FileType.TSV,
|
||||||
"text/markdown": FileType.MD,
|
"text/markdown": FileType.MD,
|
||||||
"text/x-markdown": FileType.MD,
|
"text/x-markdown": FileType.MD,
|
||||||
|
"text/org": FileType.ORG,
|
||||||
"text/x-rst": FileType.RST,
|
"text/x-rst": FileType.RST,
|
||||||
"application/epub": FileType.EPUB,
|
"application/epub": FileType.EPUB,
|
||||||
"application/epub+zip": FileType.EPUB,
|
"application/epub+zip": FileType.EPUB,
|
||||||
@ -161,6 +163,7 @@ EXT_TO_FILETYPE = {
|
|||||||
".htm": FileType.HTML,
|
".htm": FileType.HTML,
|
||||||
".html": FileType.HTML,
|
".html": FileType.HTML,
|
||||||
".md": FileType.MD,
|
".md": FileType.MD,
|
||||||
|
".org": FileType.ORG,
|
||||||
".rst": FileType.RST,
|
".rst": FileType.RST,
|
||||||
".xlsx": FileType.XLSX,
|
".xlsx": FileType.XLSX,
|
||||||
".pptx": FileType.PPTX,
|
".pptx": FileType.PPTX,
|
||||||
@ -289,7 +292,7 @@ def detect_filetype(
|
|||||||
if file and _check_eml_from_buffer(file=file) is True:
|
if file and _check_eml_from_buffer(file=file) is True:
|
||||||
return FileType.EML
|
return FileType.EML
|
||||||
|
|
||||||
if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".tsv", ".json"]:
|
if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".org", ".tsv", ".json"]:
|
||||||
return EXT_TO_FILETYPE.get(extension)
|
return EXT_TO_FILETYPE.get(extension)
|
||||||
|
|
||||||
# Safety catch
|
# Safety catch
|
||||||
|
@ -23,6 +23,7 @@ from unstructured.partition.json import partition_json
|
|||||||
from unstructured.partition.md import partition_md
|
from unstructured.partition.md import partition_md
|
||||||
from unstructured.partition.msg import partition_msg
|
from unstructured.partition.msg import partition_msg
|
||||||
from unstructured.partition.odt import partition_odt
|
from unstructured.partition.odt import partition_odt
|
||||||
|
from unstructured.partition.org import partition_org
|
||||||
from unstructured.partition.pdf import partition_pdf
|
from unstructured.partition.pdf import partition_pdf
|
||||||
from unstructured.partition.ppt import partition_ppt
|
from unstructured.partition.ppt import partition_ppt
|
||||||
from unstructured.partition.pptx import partition_pptx
|
from unstructured.partition.pptx import partition_pptx
|
||||||
@ -154,6 +155,13 @@ def partition(
|
|||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
elif filetype == FileType.ORG:
|
||||||
|
elements = partition_org(
|
||||||
|
filename=filename,
|
||||||
|
file=file,
|
||||||
|
include_page_breaks=include_page_breaks,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
elif filetype == FileType.RST:
|
elif filetype == FileType.RST:
|
||||||
elements = partition_rst(
|
elements = partition_rst(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
|
31
unstructured/partition/org.py
Normal file
31
unstructured/partition/org.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
from typing import IO, List, Optional
|
||||||
|
|
||||||
|
from unstructured.documents.elements import Element
|
||||||
|
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||||
|
from unstructured.partition.html import convert_and_partition_html
|
||||||
|
|
||||||
|
|
||||||
|
@add_metadata_with_filetype(FileType.ORG)
|
||||||
|
def partition_org(
|
||||||
|
filename: Optional[str] = None,
|
||||||
|
file: Optional[IO] = None,
|
||||||
|
include_page_breaks: bool = False,
|
||||||
|
) -> List[Element]:
|
||||||
|
"""Partitions an org document. The document is first converted to HTML and then
|
||||||
|
partitioned using partition_html.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
filename
|
||||||
|
A string defining the target filename path.
|
||||||
|
file
|
||||||
|
A file-like object using "rb" mode --> open(filename, "rb").
|
||||||
|
include_page_breaks
|
||||||
|
If True, the output will include page breaks if the filetype supports it
|
||||||
|
"""
|
||||||
|
return convert_and_partition_html(
|
||||||
|
source_format="org",
|
||||||
|
filename=filename,
|
||||||
|
file=file,
|
||||||
|
include_page_breaks=include_page_breaks,
|
||||||
|
)
|
Loading…
x
Reference in New Issue
Block a user