mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-25 14:14:30 +00:00
feat: add ability to extract extra metadata with regex (#763)
* first pass on regex metadata * fix typing for regex metadata * add dataclass back in * add decorators * fix tests * update docs * add tests for regex metadata * add process metadata to tsv * changelog and version * docs typos * consolidate to using a single kwarg * fix test
This commit is contained in:
parent
ec403e245c
commit
4ea716837d
@ -8,6 +8,7 @@
|
||||
|
||||
### Features
|
||||
|
||||
* Provides users with the ability to extract additional metadata via regex.
|
||||
* Updates `partition_docx` to include headers and footers in the output.
|
||||
* Create `partition_tsv` and associated tests. Make additional changes to `detect_filetype`.
|
||||
|
||||
|
||||
@ -15,7 +15,10 @@ Library Documentation
|
||||
Check out this section to learn about basic workflows in ``unstructured``.
|
||||
|
||||
:doc:`bricks`
|
||||
Learning more about partitioning, cleaning, and staging bricks, included advanced usage patterns.
|
||||
Learn more about partitioning, cleaning, and staging bricks, including advanced usage patterns.
|
||||
|
||||
:doc:`metadata`
|
||||
Learn more about how metadata is tracked in the ``unstructured`` library.
|
||||
|
||||
:doc:`examples`
|
||||
Examples of other types of workflows within the ``unstructured`` package.
|
||||
@ -33,5 +36,6 @@ Library Documentation
|
||||
installing
|
||||
getting_started
|
||||
bricks
|
||||
metadata
|
||||
examples
|
||||
integrations
|
||||
|
||||
84
docs/source/metadata.rst
Normal file
84
docs/source/metadata.rst
Normal file
@ -0,0 +1,84 @@
|
||||
Metadata
|
||||
========
|
||||
|
||||
The ``unstructured`` package tracks a variety of metadata about Elements extracted from documents.
|
||||
Tracking metadata enables users to filter document elements downstream based on element metadata of interest.
|
||||
For example, a user may be interested in selecting document elements from a given page number
|
||||
or an e-mail with a given subject line.
|
||||
|
||||
Metadata is tracked at the element level. You can extract the metadata for a given document element
|
||||
with ``element.metadata``. For a dictionary representation, use ``element.metadata.to_dict()``.
|
||||
All document types return the following metadata fields when the information is available from
|
||||
the source file:
|
||||
|
||||
* ``filename``
|
||||
* ``file_directory``
|
||||
* ``date``
|
||||
* ``filetype``
|
||||
* ``page_number``
|
||||
|
||||
|
||||
Email
|
||||
-----
|
||||
|
||||
Emails will include ``sent_from``, ``sent_to``, and ``subject`` metadata.
|
||||
``sent_from`` is a list of strings because the `RFC 822 <https://www.rfc-editor.org/rfc/rfc822>`_
|
||||
spec for emails allows for multiple sender email addresses.
|
||||
|
||||
|
||||
Microsoft Excel Documents
|
||||
--------------------------
|
||||
|
||||
For Excel documents, ``ElementMetadata`` will contain a ``page_name`` element, which corresponds
|
||||
to the sheet name in the Excel document.
|
||||
|
||||
|
||||
Microsoft Word Documents
|
||||
-------------------------
|
||||
|
||||
Headers and footers in Word documents include a ``header_footer_type`` indicating which page
|
||||
a header or footer applies to. Valid values are ``"primary"``, ``"even_only"``, and ``"first_page"``.
|
||||
|
||||
|
||||
Webpages
|
||||
---------
|
||||
|
||||
Elements from webpages will include a ``url`` metadata field, corresponding to the URL for the webpage.
|
||||
|
||||
|
||||
|
||||
###########################
|
||||
Advanced Metadata Options
|
||||
###########################
|
||||
|
||||
|
||||
|
||||
Extract Metadata with Regexes
|
||||
------------------------------
|
||||
|
||||
``unstructured`` allows users to extract additional metadata with regexes using the ``regex_metadata`` kwarg.
|
||||
Here is an example of how to extract regex metadata:
|
||||
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.partition.text import partition_text
|
||||
|
||||
text = "SPEAKER 1: It is my turn to speak now!"
|
||||
elements = partition_text(text=text, regex_metadata={"speaker": r"SPEAKER \d{1,3}:"})
|
||||
elements[0].metadata.regex_metadata
|
||||
|
||||
The result will look like:
|
||||
|
||||
|
||||
.. code:: python
|
||||
|
||||
{'speaker':
|
||||
[
|
||||
{
|
||||
'text': 'SPEAKER 1:',
|
||||
'start': 0,
|
||||
'end': 10,
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -206,15 +206,18 @@ def test_partition_email_has_metadata():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml")
|
||||
elements = partition_email(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert elements[0].metadata == ElementMetadata(
|
||||
filename=filename,
|
||||
date="2022-12-16T17:04:16-05:00",
|
||||
page_number=None,
|
||||
url=None,
|
||||
sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
|
||||
sent_to=["Matthew Robinson <mrobinson@unstructured.io>"],
|
||||
subject="Test Email",
|
||||
filetype="message/rfc822",
|
||||
assert (
|
||||
elements[0].metadata.to_dict()
|
||||
== ElementMetadata(
|
||||
filename=filename,
|
||||
date="2022-12-16T17:04:16-05:00",
|
||||
page_number=None,
|
||||
url=None,
|
||||
sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
|
||||
sent_to=["Matthew Robinson <mrobinson@unstructured.io>"],
|
||||
subject="Test Email",
|
||||
filetype="message/rfc822",
|
||||
).to_dict()
|
||||
)
|
||||
|
||||
expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00")
|
||||
|
||||
@ -36,15 +36,18 @@ def test_partition_msg_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
|
||||
elements = partition_msg(filename=filename)
|
||||
assert elements == EXPECTED_MSG_OUTPUT
|
||||
assert elements[0].metadata == ElementMetadata(
|
||||
filename=filename,
|
||||
date="2022-12-16T17:04:16-05:00",
|
||||
page_number=None,
|
||||
url=None,
|
||||
sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
|
||||
sent_to=["Matthew Robinson (None)"],
|
||||
subject="Test Email",
|
||||
filetype="application/vnd.ms-outlook",
|
||||
assert (
|
||||
elements[0].metadata.to_dict()
|
||||
== ElementMetadata(
|
||||
filename=filename,
|
||||
date="2022-12-16T17:04:16-05:00",
|
||||
page_number=None,
|
||||
url=None,
|
||||
sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
|
||||
sent_to=["Matthew Robinson (None)"],
|
||||
subject="Test Email",
|
||||
filetype="application/vnd.ms-outlook",
|
||||
).to_dict()
|
||||
)
|
||||
|
||||
|
||||
|
||||
@ -145,3 +145,12 @@ the fox met a bear."""
|
||||
NarrativeText(text="The big brown fox was walking down the lane."),
|
||||
NarrativeText(text="At the end of the lane, the fox met a bear."),
|
||||
]
|
||||
|
||||
|
||||
def test_partition_text_extract_regex_metadata():
    """Regex matches are reported per field with the matched text and its span."""
    speaker_pattern = r"SPEAKER \d{1,3}"
    expected = {
        "speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}],
    }

    elements = partition_text(
        text="SPEAKER 1: It is my turn to speak now!",
        regex_metadata={"speaker": speaker_pattern},
    )

    first_element_metadata = elements[0].metadata
    assert first_element_metadata.regex_metadata == expected
|
||||
|
||||
@ -2,11 +2,14 @@ from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import hashlib
|
||||
import inspect
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
from abc import ABC
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
|
||||
from functools import wraps
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple, TypedDict, Union, cast
|
||||
|
||||
|
||||
class NoID(ABC):
|
||||
@ -30,6 +33,14 @@ class DataSourceMetadata:
|
||||
return {key: value for key, value in self.__dict__.items() if value is not None}
|
||||
|
||||
|
||||
class RegexMetadata(TypedDict):
    """A single regex match found in the text of a document element."""

    # The substring of the element text that matched the pattern.
    text: str
    # Offset into the element text at which the match begins.
    start: int
    # Offset into the element text at which the match ends.
    end: int
|
||||
|
||||
|
||||
@dataclass
|
||||
class ElementMetadata:
|
||||
data_source: Optional[DataSourceMetadata] = None
|
||||
@ -58,6 +69,9 @@ class ElementMetadata:
|
||||
# Text format metadata fields
|
||||
text_as_html: Optional[str] = None
|
||||
|
||||
# Metadata extracted via regex
|
||||
regex_metadata: Optional[Dict[str, List[RegexMetadata]]] = None
|
||||
|
||||
def __post_init__(self):
|
||||
if isinstance(self.filename, pathlib.Path):
|
||||
self.filename = str(self.filename)
|
||||
@ -68,10 +82,12 @@ class ElementMetadata:
|
||||
self.filename = filename
|
||||
|
||||
def to_dict(self):
|
||||
dict = {key: value for key, value in self.__dict__.items() if value is not None}
|
||||
_dict = {key: value for key, value in self.__dict__.items() if value is not None}
|
||||
if "regex_metadata" in _dict and not _dict["regex_metadata"]:
|
||||
_dict.pop("regex_metadata")
|
||||
if self.data_source:
|
||||
dict["data_source"] = cast(DataSourceMetadata, self.data_source).to_dict()
|
||||
return dict
|
||||
_dict["data_source"] = cast(DataSourceMetadata, self.data_source).to_dict()
|
||||
return _dict
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, input_dict):
|
||||
@ -91,6 +107,58 @@ class ElementMetadata:
|
||||
return dt
|
||||
|
||||
|
||||
def process_metadata():
    """Decorator for processing metadata for document elements.

    The returned decorator wraps a function that produces a list of elements.
    After the wrapped function runs, the wrapper works out the effective value
    of its ``regex_metadata`` argument (passed positionally, by keyword, or taken
    from the signature default) and hands it, together with the elements, to
    ``_add_regex_metadata``.

    Returns
    -------
    A decorator that accepts the function to wrap and returns the wrapper.
    """

    def decorator(func: Callable):
        @wraps(func)
        def wrapper(*args, **kwargs):
            elements = func(*args, **kwargs)

            # Reconstruct the call arguments by name: positional values are paired
            # with parameter names in declaration order, then keyword arguments.
            sig = inspect.signature(func)
            params = dict(**dict(zip(sig.parameters, args)), **kwargs)
            # Fall back to declared defaults for parameters the caller omitted.
            for param in sig.parameters.values():
                if param.name not in params and param.default is not param.empty:
                    params[param.name] = param.default

            # An explicit ``regex_metadata=None`` is treated the same as omitting
            # the argument, so the helper always receives a mapping.
            regex_metadata: Dict[str, str] = params.get("regex_metadata") or {}
            elements = _add_regex_metadata(elements, regex_metadata)

            return elements

        return wrapper

    return decorator
|
||||
|
||||
|
||||
def _add_regex_metadata(
    elements: List[Element],
    regex_metadata: Optional[Dict[str, str]] = None,
) -> List[Element]:
    """Adds metadata based on user provided regular expressions.

    For every ``Text`` element, each pattern is searched against the element
    text and the matches are stored on the ``regex_metadata`` attribute of the
    element metadata, keyed by field name. Fields with no matches are left out;
    a ``Text`` element with no matches at all receives an empty dict.

    Parameters
    ----------
    elements
        The elements to annotate. They are modified in place.
    regex_metadata
        Mapping of metadata field name to regex pattern. ``None`` or an empty
        mapping means no patterns are applied.

    Returns
    -------
    The same list of elements that was passed in.
    """
    # Compile each pattern once up front rather than once per element. This also
    # means an invalid pattern fails immediately, regardless of the elements.
    compiled_patterns = {
        field_name: re.compile(pattern) for field_name, pattern in (regex_metadata or {}).items()
    }

    for element in elements:
        if not isinstance(element, Text):
            continue

        matches_by_field: Dict[str, List[RegexMetadata]] = {}
        for field_name, compiled in compiled_patterns.items():
            results: List[RegexMetadata] = [
                {"text": match.group(), "start": match.start(), "end": match.end()}
                for match in compiled.finditer(element.text)
            ]
            if results:
                matches_by_field[field_name] = results

        element.metadata.regex_metadata = matches_by_field

    return elements
|
||||
|
||||
|
||||
class Element(ABC):
|
||||
"""An element is a section of a page in the document."""
|
||||
|
||||
|
||||
@ -4,17 +4,24 @@ from typing import IO, BinaryIO, List, Optional, Union, cast
|
||||
import lxml.html
|
||||
import pandas as pd
|
||||
|
||||
from unstructured.documents.elements import Element, ElementMetadata, Table
|
||||
from unstructured.documents.elements import (
|
||||
Element,
|
||||
ElementMetadata,
|
||||
Table,
|
||||
process_metadata,
|
||||
)
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.CSV)
|
||||
def partition_csv(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
|
||||
metadata_filename: Optional[str] = None,
|
||||
include_metadata: bool = True,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions Microsoft Excel Documents in .csv format into its document elements.
|
||||
|
||||
|
||||
@ -2,17 +2,19 @@ import os
|
||||
import tempfile
|
||||
from typing import IO, List, Optional
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.common import convert_office_doc, exactly_one
|
||||
from unstructured.partition.docx import partition_docx
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.DOC)
|
||||
def partition_doc(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
include_page_breaks: bool = True,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions Microsoft Word Documents in .doc format into its document elements.
|
||||
|
||||
|
||||
@ -22,6 +22,7 @@ from unstructured.documents.elements import (
|
||||
Table,
|
||||
Text,
|
||||
Title,
|
||||
process_metadata,
|
||||
)
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.common import (
|
||||
@ -102,12 +103,14 @@ def _get_paragraph_runs(paragraph):
|
||||
Paragraph.runs = property(lambda self: _get_paragraph_runs(self))
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.DOCX)
|
||||
def partition_docx(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
|
||||
metadata_filename: Optional[str] = None,
|
||||
include_page_breaks: bool = True,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions Microsoft Word Documents in .docx format into its document elements.
|
||||
|
||||
|
||||
@ -29,6 +29,7 @@ from unstructured.documents.elements import (
|
||||
NarrativeText,
|
||||
Text,
|
||||
Title,
|
||||
process_metadata,
|
||||
)
|
||||
from unstructured.documents.email_elements import (
|
||||
MetaData,
|
||||
@ -182,6 +183,7 @@ def find_embedded_image(
|
||||
return Image(text=image_info[:-1]), element
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.EML)
|
||||
def partition_email(
|
||||
filename: Optional[str] = None,
|
||||
@ -190,6 +192,7 @@ def partition_email(
|
||||
content_source: str = "text/html",
|
||||
encoding: Optional[str] = None,
|
||||
include_headers: bool = False,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions an .eml documents into its constituent elements.
|
||||
Parameters
|
||||
|
||||
@ -1,15 +1,17 @@
|
||||
from typing import IO, List, Optional
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.html import convert_and_partition_html
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.EPUB)
|
||||
def partition_epub(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
include_page_breaks: bool = False,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions an EPUB document. The document is first converted to HTML and then
|
||||
partitioned using partition_html.
|
||||
|
||||
@ -2,7 +2,7 @@ from typing import IO, Dict, List, Optional
|
||||
|
||||
import requests
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.documents.html import HTMLDocument
|
||||
from unstructured.documents.xml import VALID_PARSERS
|
||||
from unstructured.file_utils.encoding import read_txt_file
|
||||
@ -17,6 +17,7 @@ from unstructured.partition.common import (
|
||||
)
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.HTML)
|
||||
def partition_html(
|
||||
filename: Optional[str] = None,
|
||||
@ -29,6 +30,7 @@ def partition_html(
|
||||
headers: Dict[str, str] = {},
|
||||
ssl_verify: bool = True,
|
||||
parser: VALID_PARSERS = None,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions an HTML document into its constituent elements.
|
||||
|
||||
|
||||
@ -1,10 +1,11 @@
|
||||
from typing import List, Optional
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.partition.common import exactly_one
|
||||
from unstructured.partition.pdf import partition_pdf_or_image
|
||||
|
||||
|
||||
@process_metadata()
|
||||
def partition_image(
|
||||
filename: str = "",
|
||||
file: Optional[bytes] = None,
|
||||
@ -14,6 +15,7 @@ def partition_image(
|
||||
include_page_breaks: bool = False,
|
||||
ocr_languages: str = "eng",
|
||||
strategy: str = "auto",
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Parses an image into a list of interpreted elements.
|
||||
|
||||
|
||||
@ -2,18 +2,20 @@ import json
|
||||
import re
|
||||
from typing import IO, List, Optional
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
|
||||
from unstructured.partition.common import exactly_one
|
||||
from unstructured.staging.base import dict_to_elements
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.JSON)
|
||||
def partition_json(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
text: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions an .json document into its constituent elements."""
|
||||
if text is not None and text.strip() == "" and not file and not filename:
|
||||
|
||||
@ -3,7 +3,7 @@ from typing import IO, List, Optional, Union
|
||||
import markdown
|
||||
import requests
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.documents.xml import VALID_PARSERS
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.common import exactly_one
|
||||
@ -16,6 +16,7 @@ def optional_decode(contents: Union[str, bytes]) -> str:
|
||||
return contents
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.MD)
|
||||
def partition_md(
|
||||
filename: Optional[str] = None,
|
||||
@ -25,6 +26,7 @@ def partition_md(
|
||||
include_page_breaks: bool = False,
|
||||
include_metadata: bool = True,
|
||||
parser: VALID_PARSERS = None,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
# Verify that only one of the arguments was provided
|
||||
if text is None:
|
||||
|
||||
@ -3,7 +3,7 @@ from typing import IO, Dict, List, Optional
|
||||
|
||||
import msg_parser
|
||||
|
||||
from unstructured.documents.elements import Element, ElementMetadata
|
||||
from unstructured.documents.elements import Element, ElementMetadata, process_metadata
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.common import exactly_one
|
||||
from unstructured.partition.email import convert_to_iso_8601
|
||||
@ -11,10 +11,12 @@ from unstructured.partition.html import partition_html
|
||||
from unstructured.partition.text import partition_text
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.MSG)
|
||||
def partition_msg(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions a MSFT Outlook .msg file
|
||||
|
||||
|
||||
@ -1,12 +1,17 @@
|
||||
from typing import IO, List, Optional
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.docx import convert_and_partition_docx
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.ODT)
|
||||
def partition_odt(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
|
||||
def partition_odt(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions Open Office Documents in .odt format into its document elements.
|
||||
|
||||
Parameters
|
||||
|
||||
@ -9,7 +9,12 @@ from pdfminer.utils import open_filename
|
||||
from PIL import Image
|
||||
|
||||
from unstructured.cleaners.core import clean_extra_whitespace
|
||||
from unstructured.documents.elements import Element, ElementMetadata, PageBreak
|
||||
from unstructured.documents.elements import (
|
||||
Element,
|
||||
ElementMetadata,
|
||||
PageBreak,
|
||||
process_metadata,
|
||||
)
|
||||
from unstructured.file_utils.filetype import (
|
||||
FileType,
|
||||
add_metadata_with_filetype,
|
||||
@ -26,6 +31,7 @@ from unstructured.partition.text import element_from_text, partition_text
|
||||
from unstructured.utils import requires_dependencies
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.PDF)
|
||||
def partition_pdf(
|
||||
filename: str = "",
|
||||
@ -37,6 +43,7 @@ def partition_pdf(
|
||||
strategy: str = "auto",
|
||||
infer_table_structure: bool = False,
|
||||
ocr_languages: str = "eng",
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Parses a pdf document into a list of interpreted elements.
|
||||
Parameters
|
||||
|
||||
@ -2,17 +2,19 @@ import os
|
||||
import tempfile
|
||||
from typing import IO, List, Optional
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.common import convert_office_doc, exactly_one
|
||||
from unstructured.partition.pptx import partition_pptx
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.PPT)
|
||||
def partition_ppt(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
include_page_breaks: bool = False,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions Microsoft PowerPoint Documents in .ppt format into their document elements.
|
||||
|
||||
|
||||
@ -12,6 +12,7 @@ from unstructured.documents.elements import (
|
||||
Table,
|
||||
Text,
|
||||
Title,
|
||||
process_metadata,
|
||||
)
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.common import (
|
||||
@ -27,12 +28,14 @@ from unstructured.partition.text_type import (
|
||||
OPENXML_SCHEMA_NAME = "{http://schemas.openxmlformats.org/drawingml/2006/main}"
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.PPTX)
|
||||
def partition_pptx(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
|
||||
include_page_breaks: bool = True,
|
||||
metadata_filename: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions Microsoft PowerPoint Documents in .pptx format into its document elements.
|
||||
|
||||
|
||||
@ -1,15 +1,17 @@
|
||||
from typing import IO, List, Optional
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.html import convert_and_partition_html
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.RST)
|
||||
def partition_rst(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
include_page_breaks: bool = False,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions an RST document. The document is first converted to HTML and then
|
||||
partitioned using partition_html.
|
||||
|
||||
@ -1,15 +1,17 @@
|
||||
from typing import IO, List, Optional
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.html import convert_and_partition_html
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.RTF)
|
||||
def partition_rtf(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
include_page_breaks: bool = False,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions an RTF document. The document is first converted to HTML and then
|
||||
partitioned using partiton_html.
|
||||
|
||||
@ -10,6 +10,7 @@ from unstructured.documents.elements import (
|
||||
NarrativeText,
|
||||
Text,
|
||||
Title,
|
||||
process_metadata,
|
||||
)
|
||||
from unstructured.file_utils.encoding import read_txt_file
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
@ -27,6 +28,7 @@ def split_by_paragraph(content: str) -> List[str]:
|
||||
return re.split(PARAGRAPH_PATTERN, content)
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.TXT)
|
||||
def partition_text(
|
||||
filename: Optional[str] = None,
|
||||
@ -36,6 +38,7 @@ def partition_text(
|
||||
paragraph_grouper: Optional[Callable[[str], str]] = None,
|
||||
metadata_filename: Optional[str] = None,
|
||||
include_metadata: bool = True,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions an .txt documents into its constituent elements.
|
||||
Parameters
|
||||
|
||||
@ -4,17 +4,24 @@ from typing import IO, BinaryIO, List, Optional, Union, cast
|
||||
import lxml.html
|
||||
import pandas as pd
|
||||
|
||||
from unstructured.documents.elements import Element, ElementMetadata, Table
|
||||
from unstructured.documents.elements import (
|
||||
Element,
|
||||
ElementMetadata,
|
||||
Table,
|
||||
process_metadata,
|
||||
)
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.TSV)
|
||||
def partition_tsv(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
|
||||
metadata_filename: Optional[str] = None,
|
||||
include_metadata: bool = True,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions TSV files into document elements.
|
||||
|
||||
|
||||
@ -4,17 +4,24 @@ from typing import IO, BinaryIO, List, Optional, Union, cast
|
||||
import lxml.html
|
||||
import pandas as pd
|
||||
|
||||
from unstructured.documents.elements import Element, ElementMetadata, Table
|
||||
from unstructured.documents.elements import (
|
||||
Element,
|
||||
ElementMetadata,
|
||||
Table,
|
||||
process_metadata,
|
||||
)
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.XLSX)
|
||||
def partition_xlsx(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
|
||||
metadata_filename: Optional[str] = None,
|
||||
include_metadata: bool = True,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions Microsoft Excel Documents in .xlsx format into its document elements.
|
||||
|
||||
|
||||
@ -1,7 +1,8 @@
|
||||
import xml.etree.ElementTree as ET
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from typing import IO, BinaryIO, Optional, Union, cast
|
||||
from typing import IO, BinaryIO, List, Optional, Union, cast
|
||||
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.file_utils.encoding import read_txt_file
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
|
||||
@ -38,6 +39,7 @@ def get_leaf_elements(
|
||||
return "\n".join(leaf_elements) # type: ignore
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.XML)
|
||||
def partition_xml(
|
||||
filename: Optional[str] = None,
|
||||
@ -47,7 +49,8 @@ def partition_xml(
|
||||
metadata_filename: Optional[str] = None,
|
||||
include_metadata: bool = True,
|
||||
encoding: Optional[str] = None,
|
||||
):
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions an XML document into its document elements.
|
||||
|
||||
Parameters
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user