feat: add ability to extract extra metadata with regex (#763)

* first pass on regex metadata

* fix typing for regex metadata

* add dataclass back in

* add decorators

* fix tests

* update docs

* add tests for regex metadata

* add process metadata to tsv

* changelog and version

* docs typos

* consolidate to using a single kwarg

* fix test
This commit is contained in:
Matt Robinson 2023-06-16 10:10:56 -04:00 committed by GitHub
parent ec403e245c
commit 4ea716837d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
27 changed files with 281 additions and 41 deletions

View File

@ -8,6 +8,7 @@
### Features
* Provides users with the ability to extract additional metadata via regex.
* Updates `partition_docx` to include headers and footers in the output.
* Create `partition_tsv` and associated tests. Make additional changes to `detect_filetype`.

View File

@ -15,7 +15,10 @@ Library Documentation
Check out this section to learn about basic workflows in ``unstructured``.
:doc:`bricks`
Learning more about partitioning, cleaning, and staging bricks, included advanced usage patterns.
Learn more about partitioning, cleaning, and staging bricks, including advanced usage patterns.
:doc:`metadata`
Learn more about how metadata is tracked in the ``unstructured`` library.
:doc:`examples`
Examples of other types of workflows within the ``unstructured`` package.
@ -33,5 +36,6 @@ Library Documentation
installing
getting_started
bricks
metadata
examples
integrations

84
docs/source/metadata.rst Normal file
View File

@ -0,0 +1,84 @@
Metadata
========
The ``unstructured`` package tracks a variety of metadata about Elements extracted from documents.
Tracking metadata enables users to filter document elements downstream based on element metadata of interest.
For example, a user may be interested in selecting document elements from a given page number
or an e-mail with a given subject line.
Metadata is tracked at the element level. You can extract the metadata for a given document element
with ``element.metadata``. For a dictionary representation, use ``element.metadata.to_dict()``.
All document types return the following metadata fields when the information is available from
the source file:
* ``filename``
* ``file_directory``
* ``date``
* ``filetype``
* ``page_number``
Email
-----
Emails will include ``sent_from``, ``sent_to``, and ``subject`` metadata.
``sent_from`` is a list of strings because the `RFC 822 <https://www.rfc-editor.org/rfc/rfc822>`_
spec for emails allows for multiple sent from email addresses.
Microsoft Excel Documents
--------------------------
For Excel documents, ``ElementMetadata`` will contain a ``page_name`` element, which corresponds
to the sheet name in the Excel document.
Microsoft Word Documents
-------------------------
Headers and footers in Word documents include a ``header_footer_type`` indicating which page
a header or footer applies to. Valid values are ``"primary"``, ``"even_only"``, and ``"first_page"``.
Webpages
---------
Elements from webpages will include a ``url`` metadata field, corresponding to the URL for the webpage.
#########################
Advanced Metadata Options
#########################
Extract Metadata with Regexes
------------------------------
``unstructured`` allows users to extract additional metadata with regexes using the ``regex_metadata`` kwarg.
Here is an example of how to extract regex metadata:
.. code:: python
from unstructured.partition.text import partition_text
text = "SPEAKER 1: It is my turn to speak now!"
elements = partition_text(text=text, regex_metadata={"speaker": r"SPEAKER \d{1,3}:"})
elements[0].metadata.regex_metadata
The result will look like:
.. code:: python
{'speaker':
[
{
'text': 'SPEAKER 1:',
'start': 0,
'end': 10,
}
]
}

View File

@ -206,15 +206,18 @@ def test_partition_email_has_metadata():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml")
elements = partition_email(filename=filename)
assert len(elements) > 0
assert elements[0].metadata == ElementMetadata(
filename=filename,
date="2022-12-16T17:04:16-05:00",
page_number=None,
url=None,
sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
sent_to=["Matthew Robinson <mrobinson@unstructured.io>"],
subject="Test Email",
filetype="message/rfc822",
assert (
elements[0].metadata.to_dict()
== ElementMetadata(
filename=filename,
date="2022-12-16T17:04:16-05:00",
page_number=None,
url=None,
sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
sent_to=["Matthew Robinson <mrobinson@unstructured.io>"],
subject="Test Email",
filetype="message/rfc822",
).to_dict()
)
expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00")

View File

@ -36,15 +36,18 @@ def test_partition_msg_from_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
elements = partition_msg(filename=filename)
assert elements == EXPECTED_MSG_OUTPUT
assert elements[0].metadata == ElementMetadata(
filename=filename,
date="2022-12-16T17:04:16-05:00",
page_number=None,
url=None,
sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
sent_to=["Matthew Robinson (None)"],
subject="Test Email",
filetype="application/vnd.ms-outlook",
assert (
elements[0].metadata.to_dict()
== ElementMetadata(
filename=filename,
date="2022-12-16T17:04:16-05:00",
page_number=None,
url=None,
sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
sent_to=["Matthew Robinson (None)"],
subject="Test Email",
filetype="application/vnd.ms-outlook",
).to_dict()
)

View File

@ -145,3 +145,12 @@ the fox met a bear."""
NarrativeText(text="The big brown fox was walking down the lane."),
NarrativeText(text="At the end of the lane, the fox met a bear."),
]
def test_partition_text_extract_regex_metadata():
    """A pattern passed via ``regex_metadata`` is recorded with its matched text and span."""
    patterns = {"speaker": r"SPEAKER \d{1,3}"}
    elements = partition_text(
        text="SPEAKER 1: It is my turn to speak now!",
        regex_metadata=patterns,
    )
    first_metadata = elements[0].metadata
    expected = {"speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}]}
    assert first_metadata.regex_metadata == expected

View File

@ -2,11 +2,14 @@ from __future__ import annotations
import datetime
import hashlib
import inspect
import os
import pathlib
import re
from abc import ABC
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
from functools import wraps
from typing import Any, Callable, Dict, List, Optional, Tuple, TypedDict, Union, cast
class NoID(ABC):
@ -30,6 +33,14 @@ class DataSourceMetadata:
return {key: value for key, value in self.__dict__.items() if value is not None}
class RegexMetadata(TypedDict):
    """Metadata that is extracted from a document element via regex.

    One instance describes a single regex match found in an element's text.
    """

    # The substring of the element text that the pattern matched.
    text: str
    # Index in the element text where the match begins.
    start: int
    # Index in the element text where the match ends (exclusive, as in a slice).
    end: int
@dataclass
class ElementMetadata:
data_source: Optional[DataSourceMetadata] = None
@ -58,6 +69,9 @@ class ElementMetadata:
# Text format metadata fields
text_as_html: Optional[str] = None
# Metadata extracted via regex
regex_metadata: Optional[Dict[str, List[RegexMetadata]]] = None
def __post_init__(self):
if isinstance(self.filename, pathlib.Path):
self.filename = str(self.filename)
@ -68,10 +82,12 @@ class ElementMetadata:
self.filename = filename
def to_dict(self):
dict = {key: value for key, value in self.__dict__.items() if value is not None}
_dict = {key: value for key, value in self.__dict__.items() if value is not None}
if "regex_metadata" in _dict and not _dict["regex_metadata"]:
_dict.pop("regex_metadata")
if self.data_source:
dict["data_source"] = cast(DataSourceMetadata, self.data_source).to_dict()
return dict
_dict["data_source"] = cast(DataSourceMetadata, self.data_source).to_dict()
return _dict
@classmethod
def from_dict(cls, input_dict):
@ -91,6 +107,58 @@ class ElementMetadata:
return dt
def process_metadata():
    """Decorator for processing metadata for document elements.

    The returned decorator wraps a function that produces a list of elements. After the
    wrapped function runs, the wrapper resolves the ``regex_metadata`` argument (from the
    positional arguments, the keyword arguments, or the wrapped function's declared
    default) and passes the elements through ``_add_regex_metadata``.
    """

    def decorator(func: Callable):
        # The signature is the same for every call, so inspect it once at decoration
        # time rather than on each invocation.
        sig = inspect.signature(func)

        @wraps(func)
        def wrapper(*args, **kwargs):
            elements = func(*args, **kwargs)

            # Map positional arguments onto parameter names, overlay the keyword
            # arguments, then fill in declared defaults for anything not supplied.
            params = dict(**dict(zip(sig.parameters, args)), **kwargs)
            for param in sig.parameters.values():
                if param.name not in params and param.default is not param.empty:
                    params[param.name] = param.default

            # An explicit ``regex_metadata=None`` is treated the same as omitting the
            # argument, so downstream code always receives a dictionary.
            regex_metadata: Dict[str, str] = params.get("regex_metadata") or {}
            return _add_regex_metadata(elements, regex_metadata)

        return wrapper

    return decorator
def _add_regex_metadata(
elements: List[Element],
regex_metadata: Dict[str, str] = {},
) -> List[Element]:
"""Adds metadata based on a user provided regular expression.
The additional metadata will be added to the regex_metadata
attrbuted in the element metadata."""
for element in elements:
if isinstance(element, Text):
_regex_metadata: Dict["str", List[RegexMetadata]] = {}
for field_name, pattern in regex_metadata.items():
results: List[RegexMetadata] = []
for result in re.finditer(pattern, element.text):
start, end = result.span()
results.append(
{
"text": element.text[start:end],
"start": start,
"end": end,
},
)
if len(results) > 0:
_regex_metadata[field_name] = results
element.metadata.regex_metadata = _regex_metadata
return elements
class Element(ABC):
"""An element is a section of a page in the document."""

View File

@ -4,17 +4,24 @@ from typing import IO, BinaryIO, List, Optional, Union, cast
import lxml.html
import pandas as pd
from unstructured.documents.elements import Element, ElementMetadata, Table
from unstructured.documents.elements import (
Element,
ElementMetadata,
Table,
process_metadata,
)
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
@process_metadata()
@add_metadata_with_filetype(FileType.CSV)
def partition_csv(
filename: Optional[str] = None,
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
metadata_filename: Optional[str] = None,
include_metadata: bool = True,
**kwargs,
) -> List[Element]:
"""Partitions Microsoft Excel Documents in .csv format into its document elements.

View File

@ -2,17 +2,19 @@ import os
import tempfile
from typing import IO, List, Optional
from unstructured.documents.elements import Element
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import convert_office_doc, exactly_one
from unstructured.partition.docx import partition_docx
@process_metadata()
@add_metadata_with_filetype(FileType.DOC)
def partition_doc(
filename: Optional[str] = None,
file: Optional[IO] = None,
include_page_breaks: bool = True,
**kwargs,
) -> List[Element]:
"""Partitions Microsoft Word Documents in .doc format into its document elements.

View File

@ -22,6 +22,7 @@ from unstructured.documents.elements import (
Table,
Text,
Title,
process_metadata,
)
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import (
@ -102,12 +103,14 @@ def _get_paragraph_runs(paragraph):
Paragraph.runs = property(lambda self: _get_paragraph_runs(self))
@process_metadata()
@add_metadata_with_filetype(FileType.DOCX)
def partition_docx(
filename: Optional[str] = None,
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
metadata_filename: Optional[str] = None,
include_page_breaks: bool = True,
**kwargs,
) -> List[Element]:
"""Partitions Microsoft Word Documents in .docx format into its document elements.

View File

@ -29,6 +29,7 @@ from unstructured.documents.elements import (
NarrativeText,
Text,
Title,
process_metadata,
)
from unstructured.documents.email_elements import (
MetaData,
@ -182,6 +183,7 @@ def find_embedded_image(
return Image(text=image_info[:-1]), element
@process_metadata()
@add_metadata_with_filetype(FileType.EML)
def partition_email(
filename: Optional[str] = None,
@ -190,6 +192,7 @@ def partition_email(
content_source: str = "text/html",
encoding: Optional[str] = None,
include_headers: bool = False,
**kwargs,
) -> List[Element]:
"""Partitions an .eml documents into its constituent elements.
Parameters

View File

@ -1,15 +1,17 @@
from typing import IO, List, Optional
from unstructured.documents.elements import Element
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.html import convert_and_partition_html
@process_metadata()
@add_metadata_with_filetype(FileType.EPUB)
def partition_epub(
filename: Optional[str] = None,
file: Optional[IO] = None,
include_page_breaks: bool = False,
**kwargs,
) -> List[Element]:
"""Partitions an EPUB document. The document is first converted to HTML and then
partitoned using partiton_html.

View File

@ -2,7 +2,7 @@ from typing import IO, Dict, List, Optional
import requests
from unstructured.documents.elements import Element
from unstructured.documents.elements import Element, process_metadata
from unstructured.documents.html import HTMLDocument
from unstructured.documents.xml import VALID_PARSERS
from unstructured.file_utils.encoding import read_txt_file
@ -17,6 +17,7 @@ from unstructured.partition.common import (
)
@process_metadata()
@add_metadata_with_filetype(FileType.HTML)
def partition_html(
filename: Optional[str] = None,
@ -29,6 +30,7 @@ def partition_html(
headers: Dict[str, str] = {},
ssl_verify: bool = True,
parser: VALID_PARSERS = None,
**kwargs,
) -> List[Element]:
"""Partitions an HTML document into its constituent elements.

View File

@ -1,10 +1,11 @@
from typing import List, Optional
from unstructured.documents.elements import Element
from unstructured.documents.elements import Element, process_metadata
from unstructured.partition.common import exactly_one
from unstructured.partition.pdf import partition_pdf_or_image
@process_metadata()
def partition_image(
filename: str = "",
file: Optional[bytes] = None,
@ -14,6 +15,7 @@ def partition_image(
include_page_breaks: bool = False,
ocr_languages: str = "eng",
strategy: str = "auto",
**kwargs,
) -> List[Element]:
"""Parses an image into a list of interpreted elements.

View File

@ -2,18 +2,20 @@ import json
import re
from typing import IO, List, Optional
from unstructured.documents.elements import Element
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
from unstructured.partition.common import exactly_one
from unstructured.staging.base import dict_to_elements
@process_metadata()
@add_metadata_with_filetype(FileType.JSON)
def partition_json(
filename: Optional[str] = None,
file: Optional[IO] = None,
text: Optional[str] = None,
**kwargs,
) -> List[Element]:
"""Partitions an .json document into its constituent elements."""
if text is not None and text.strip() == "" and not file and not filename:

View File

@ -3,7 +3,7 @@ from typing import IO, List, Optional, Union
import markdown
import requests
from unstructured.documents.elements import Element
from unstructured.documents.elements import Element, process_metadata
from unstructured.documents.xml import VALID_PARSERS
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import exactly_one
@ -16,6 +16,7 @@ def optional_decode(contents: Union[str, bytes]) -> str:
return contents
@process_metadata()
@add_metadata_with_filetype(FileType.MD)
def partition_md(
filename: Optional[str] = None,
@ -25,6 +26,7 @@ def partition_md(
include_page_breaks: bool = False,
include_metadata: bool = True,
parser: VALID_PARSERS = None,
**kwargs,
) -> List[Element]:
# Verify that only one of the arguments was provided
if text is None:

View File

@ -3,7 +3,7 @@ from typing import IO, Dict, List, Optional
import msg_parser
from unstructured.documents.elements import Element, ElementMetadata
from unstructured.documents.elements import Element, ElementMetadata, process_metadata
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import exactly_one
from unstructured.partition.email import convert_to_iso_8601
@ -11,10 +11,12 @@ from unstructured.partition.html import partition_html
from unstructured.partition.text import partition_text
@process_metadata()
@add_metadata_with_filetype(FileType.MSG)
def partition_msg(
filename: Optional[str] = None,
file: Optional[IO] = None,
**kwargs,
) -> List[Element]:
"""Partitions a MSFT Outlook .msg file

View File

@ -1,12 +1,17 @@
from typing import IO, List, Optional
from unstructured.documents.elements import Element
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.docx import convert_and_partition_docx
@process_metadata()
@add_metadata_with_filetype(FileType.ODT)
def partition_odt(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
def partition_odt(
filename: Optional[str] = None,
file: Optional[IO] = None,
**kwargs,
) -> List[Element]:
"""Partitions Open Office Documents in .odt format into its document elements.
Parameters

View File

@ -9,7 +9,12 @@ from pdfminer.utils import open_filename
from PIL import Image
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Element, ElementMetadata, PageBreak
from unstructured.documents.elements import (
Element,
ElementMetadata,
PageBreak,
process_metadata,
)
from unstructured.file_utils.filetype import (
FileType,
add_metadata_with_filetype,
@ -26,6 +31,7 @@ from unstructured.partition.text import element_from_text, partition_text
from unstructured.utils import requires_dependencies
@process_metadata()
@add_metadata_with_filetype(FileType.PDF)
def partition_pdf(
filename: str = "",
@ -37,6 +43,7 @@ def partition_pdf(
strategy: str = "auto",
infer_table_structure: bool = False,
ocr_languages: str = "eng",
**kwargs,
) -> List[Element]:
"""Parses a pdf document into a list of interpreted elements.
Parameters

View File

@ -2,17 +2,19 @@ import os
import tempfile
from typing import IO, List, Optional
from unstructured.documents.elements import Element
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import convert_office_doc, exactly_one
from unstructured.partition.pptx import partition_pptx
@process_metadata()
@add_metadata_with_filetype(FileType.PPT)
def partition_ppt(
filename: Optional[str] = None,
file: Optional[IO] = None,
include_page_breaks: bool = False,
**kwargs,
) -> List[Element]:
"""Partitions Microsoft PowerPoint Documents in .ppt format into their document elements.

View File

@ -12,6 +12,7 @@ from unstructured.documents.elements import (
Table,
Text,
Title,
process_metadata,
)
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import (
@ -27,12 +28,14 @@ from unstructured.partition.text_type import (
OPENXML_SCHEMA_NAME = "{http://schemas.openxmlformats.org/drawingml/2006/main}"
@process_metadata()
@add_metadata_with_filetype(FileType.PPTX)
def partition_pptx(
filename: Optional[str] = None,
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
include_page_breaks: bool = True,
metadata_filename: Optional[str] = None,
**kwargs,
) -> List[Element]:
"""Partitions Microsoft PowerPoint Documents in .pptx format into its document elements.

View File

@ -1,15 +1,17 @@
from typing import IO, List, Optional
from unstructured.documents.elements import Element
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.html import convert_and_partition_html
@process_metadata()
@add_metadata_with_filetype(FileType.RST)
def partition_rst(
filename: Optional[str] = None,
file: Optional[IO] = None,
include_page_breaks: bool = False,
**kwargs,
) -> List[Element]:
"""Partitions an RST document. The document is first converted to HTML and then
partitioned using partition_html.

View File

@ -1,15 +1,17 @@
from typing import IO, List, Optional
from unstructured.documents.elements import Element
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.html import convert_and_partition_html
@process_metadata()
@add_metadata_with_filetype(FileType.RTF)
def partition_rtf(
filename: Optional[str] = None,
file: Optional[IO] = None,
include_page_breaks: bool = False,
**kwargs,
) -> List[Element]:
"""Partitions an RTF document. The document is first converted to HTML and then
partitioned using partiton_html.

View File

@ -10,6 +10,7 @@ from unstructured.documents.elements import (
NarrativeText,
Text,
Title,
process_metadata,
)
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
@ -27,6 +28,7 @@ def split_by_paragraph(content: str) -> List[str]:
return re.split(PARAGRAPH_PATTERN, content)
@process_metadata()
@add_metadata_with_filetype(FileType.TXT)
def partition_text(
filename: Optional[str] = None,
@ -36,6 +38,7 @@ def partition_text(
paragraph_grouper: Optional[Callable[[str], str]] = None,
metadata_filename: Optional[str] = None,
include_metadata: bool = True,
**kwargs,
) -> List[Element]:
"""Partitions an .txt documents into its constituent elements.
Parameters

View File

@ -4,17 +4,24 @@ from typing import IO, BinaryIO, List, Optional, Union, cast
import lxml.html
import pandas as pd
from unstructured.documents.elements import Element, ElementMetadata, Table
from unstructured.documents.elements import (
Element,
ElementMetadata,
Table,
process_metadata,
)
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
@process_metadata()
@add_metadata_with_filetype(FileType.TSV)
def partition_tsv(
filename: Optional[str] = None,
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
metadata_filename: Optional[str] = None,
include_metadata: bool = True,
**kwargs,
) -> List[Element]:
"""Partitions TSV files into document elements.

View File

@ -4,17 +4,24 @@ from typing import IO, BinaryIO, List, Optional, Union, cast
import lxml.html
import pandas as pd
from unstructured.documents.elements import Element, ElementMetadata, Table
from unstructured.documents.elements import (
Element,
ElementMetadata,
Table,
process_metadata,
)
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
@process_metadata()
@add_metadata_with_filetype(FileType.XLSX)
def partition_xlsx(
filename: Optional[str] = None,
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
metadata_filename: Optional[str] = None,
include_metadata: bool = True,
**kwargs,
) -> List[Element]:
"""Partitions Microsoft Excel Documents in .xlsx format into its document elements.

View File

@ -1,7 +1,8 @@
import xml.etree.ElementTree as ET
from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, Optional, Union, cast
from typing import IO, BinaryIO, List, Optional, Union, cast
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
@ -38,6 +39,7 @@ def get_leaf_elements(
return "\n".join(leaf_elements) # type: ignore
@process_metadata()
@add_metadata_with_filetype(FileType.XML)
def partition_xml(
filename: Optional[str] = None,
@ -47,7 +49,8 @@ def partition_xml(
metadata_filename: Optional[str] = None,
include_metadata: bool = True,
encoding: Optional[str] = None,
):
**kwargs,
) -> List[Element]:
"""Partitions an XML document into its document elements.
Parameters