feat: add ability to extract extra metadata with regex (#763)

* first pass on regex metadata * fix typing for regex metadata * add dataclass back in * add decorators * fix tests * update docs * add tests for regex metadata * add process metadata to tsv * changelog and version * docs typos * consolidate to using a single kwarg * fix test
2025-12-25 14:14:30 +00:00 · 2023-06-16 10:10:56 -04:00 · 2023-06-16 10:10:56 -04:00 · 4ea716837d
commit 4ea716837d
parent ec403e245c
27 changed files with 281 additions and 41 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -8,6 +8,7 @@

 ### Features

+* Provides users with the ability to extract additional metadata via regex.
 * Updates `partition_docx` to include headers and footers in the output.
 * Create `partition_tsv` and associated tests. Make additional changes to `detect_filetype`.

--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@ -15,7 +15,10 @@ Library Documentation
  Check out this section to learn about basic workflows in ``unstructured``.

 :doc:`bricks`
-  Learning more about partitioning, cleaning, and staging bricks, included advanced usage patterns.
+  Learn more about partitioning, cleaning, and staging bricks, including advanced usage patterns.
+
+:doc:`metadata`
+  Learn more about how metadata is tracked in the ``unstructured`` library.

 :doc:`examples`
  Examples of other types of workflows within the ``unstructured`` package.
@ -33,5 +36,6 @@ Library Documentation
   installing
   getting_started
   bricks
+   metadata
   examples
   integrations
--- a/docs/source/metadata.rst
+++ b/docs/source/metadata.rst
@ -0,0 +1,84 @@
+Metadata
+========
+
+The ``unstructured`` package tracks a variety of metadata about Elements extracted from documents.
+Tracking metadata enables users to filter document elements downstream based on element metadata of interest.
+For example, a user may be interested in selected document elements from a given page number
+or an e-mail with a given subject line.
+
+Metadata is tracked at the element level. You can extract the metadata for a given document element
+with ``element.metadata``. For a dictionary representation, use ``element.metadata.to_dict()``.
+All document types return the following metadata fields when the information is available from
+the source file:
+
+* ``filename``
+* ``file_directory``
+* ``date``
+* ``filetype``
+* ``page_number``
+
+
+Email
+-----
+
+Emails will include ``sent_from``, ``sent_to``, and ``subject`` metadata.
+``sent_from`` is a list of strings because the `RFC 822 <https://www.rfc-editor.org/rfc/rfc822>`_
+spec for emails allows for multiple sent from email addresses.
+
+
+Microsoft Excel Documents
+--------------------------
+
+For Excel documents, ``ElementMetadata`` will contain a ``page_name`` element, which corresponds
+to the sheet name in the Excel document.
+
+
+Microsoft Word Documents
+-------------------------
+
+Headers and footers in Word documents include a ``header_footer_type`` indicating which page
+a header or footer applies to. Valid values are ``"primary"``, ``"even_only"``, and ``"first_page"``.
+
+
+Webpages
+---------
+
+Elements from webpages will include a ``url`` metadata field, corresponding to the URL for the webpage.
+
+
+
+#########################
+Advanced Metadata Options
+#########################
+
+
+
+Extract Metadata with Regexes
+------------------------------
+
+``unstructured`` allows users to extract additional metadata with regexes using the ``regex_metadata`` kwarg.
+Here is an example of how to extract regex metadata:
+
+
+.. code:: python
+
+  from unstructured.partition.text import partition_text
+
+  text = "SPEAKER 1: It is my turn to speak now!"
+  elements = partition_text(text=text, regex_metadata={"speaker": r"SPEAKER \d{1,3}:"})
+  elements[0].metadata.regex_metadata
+
+The result will look like:
+
+
+.. code:: python
+
+  {'speaker':
+    [
+      {
+        'text': 'SPEAKER 1:',
+        'start': 0,
+        'end': 10,
+      }
+    ]
+  }
--- a/test_unstructured/partition/test_email.py
+++ b/test_unstructured/partition/test_email.py
@ -206,15 +206,18 @@ def test_partition_email_has_metadata():
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml")
    elements = partition_email(filename=filename)
    assert len(elements) > 0
-    assert elements[0].metadata == ElementMetadata(
-        filename=filename,
-        date="2022-12-16T17:04:16-05:00",
-        page_number=None,
-        url=None,
-        sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
-        sent_to=["Matthew Robinson <mrobinson@unstructured.io>"],
-        subject="Test Email",
-        filetype="message/rfc822",
+    assert (
+        elements[0].metadata.to_dict()
+        == ElementMetadata(
+            filename=filename,
+            date="2022-12-16T17:04:16-05:00",
+            page_number=None,
+            url=None,
+            sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
+            sent_to=["Matthew Robinson <mrobinson@unstructured.io>"],
+            subject="Test Email",
+            filetype="message/rfc822",
+        ).to_dict()
    )

    expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00")
--- a/test_unstructured/partition/test_msg.py
+++ b/test_unstructured/partition/test_msg.py
@ -36,15 +36,18 @@ def test_partition_msg_from_filename():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
    elements = partition_msg(filename=filename)
    assert elements == EXPECTED_MSG_OUTPUT
-    assert elements[0].metadata == ElementMetadata(
-        filename=filename,
-        date="2022-12-16T17:04:16-05:00",
-        page_number=None,
-        url=None,
-        sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
-        sent_to=["Matthew Robinson (None)"],
-        subject="Test Email",
-        filetype="application/vnd.ms-outlook",
+    assert (
+        elements[0].metadata.to_dict()
+        == ElementMetadata(
+            filename=filename,
+            date="2022-12-16T17:04:16-05:00",
+            page_number=None,
+            url=None,
+            sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
+            sent_to=["Matthew Robinson (None)"],
+            subject="Test Email",
+            filetype="application/vnd.ms-outlook",
+        ).to_dict()
    )


--- a/test_unstructured/partition/test_text.py
+++ b/test_unstructured/partition/test_text.py
@ -145,3 +145,12 @@ the fox met a bear."""
        NarrativeText(text="The big brown fox was walking down the lane."),
        NarrativeText(text="At the end of the lane, the fox met a bear."),
    ]
+
+
+def test_partition_text_extract_regex_metadata():
+    """A pattern supplied via ``regex_metadata`` yields the match text and its span."""
+    patterns = {"speaker": r"SPEAKER \d{1,3}"}
+    first_element = partition_text(
+        text="SPEAKER 1: It is my turn to speak now!",
+        regex_metadata=patterns,
+    )[0]
+
+    expected = {"speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}]}
+    assert first_element.metadata.regex_metadata == expected
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@ -2,11 +2,14 @@ from __future__ import annotations

 import datetime
 import hashlib
+import inspect
 import os
 import pathlib
+import re
 from abc import ABC
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
+from functools import wraps
+from typing import Any, Callable, Dict, List, Optional, Tuple, TypedDict, Union, cast


 class NoID(ABC):
@ -30,6 +33,14 @@ class DataSourceMetadata:
        return {key: value for key, value in self.__dict__.items() if value is not None}


+class RegexMetadata(TypedDict):
+    """Metadata that is extracted from a document element via regex.
+
+    Each instance describes a single regex match found in an element's text.
+    """
+
+    # The matched substring of the element text.
+    text: str
+    # Offset in the element text where the match begins.
+    start: int
+    # Offset in the element text just past the end of the match.
+    end: int
+
+
@dataclass
 class ElementMetadata:
    data_source: Optional[DataSourceMetadata] = None
@ -58,6 +69,9 @@ class ElementMetadata:
    # Text format metadata fields
    text_as_html: Optional[str] = None

+    # Metadata extracted via regex
+    regex_metadata: Optional[Dict[str, List[RegexMetadata]]] = None
+
    def __post_init__(self):
        if isinstance(self.filename, pathlib.Path):
            self.filename = str(self.filename)
@ -68,10 +82,12 @@ class ElementMetadata:
            self.filename = filename

    def to_dict(self):
-        dict = {key: value for key, value in self.__dict__.items() if value is not None}
+        _dict = {key: value for key, value in self.__dict__.items() if value is not None}
+        if "regex_metadata" in _dict and not _dict["regex_metadata"]:
+            _dict.pop("regex_metadata")
        if self.data_source:
-            dict["data_source"] = cast(DataSourceMetadata, self.data_source).to_dict()
-        return dict
+            _dict["data_source"] = cast(DataSourceMetadata, self.data_source).to_dict()
+        return _dict

    @classmethod
    def from_dict(cls, input_dict):
@ -91,6 +107,58 @@ class ElementMetadata:
        return dt


+def process_metadata():
+    """Decorator factory for post-processing the elements a partition function returns.
+
+    The returned decorator wraps ``func`` so that, after ``func`` has produced its
+    elements, the call's arguments are resolved by parameter name (positional
+    arguments first, then keyword arguments, then declared defaults) and the
+    ``regex_metadata`` argument, if present, is applied to the elements with
+    ``_add_regex_metadata``.
+
+    Returns:
+        A decorator that takes the partition function and returns the wrapped version.
+    """
+
+    def decorator(func: Callable):
+        # The signature does not change between calls, so inspect it once at
+        # decoration time instead of on every invocation.
+        sig = inspect.signature(func)
+
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            elements = func(*args, **kwargs)
+
+            # Map positional arguments onto parameter names, then layer keyword
+            # arguments and any declared defaults that were not supplied.
+            params = dict(**dict(zip(sig.parameters, args)), **kwargs)
+            for param in sig.parameters.values():
+                if param.name not in params and param.default is not param.empty:
+                    params[param.name] = param.default
+
+            # An explicit ``regex_metadata=None`` means "no patterns"; normalise it to
+            # an empty mapping so the helper never iterates over ``None``.
+            regex_metadata: Dict[str, str] = params.get("regex_metadata") or {}
+            return _add_regex_metadata(elements, regex_metadata)
+
+        return wrapper
+
+    return decorator
+
+
+def _add_regex_metadata(
+    elements: List[Element],
+    regex_metadata: Optional[Dict[str, str]] = None,
+) -> List[Element]:
+    """Adds metadata based on user provided regular expressions.
+
+    For every ``Text`` element, each pattern is searched for in ``element.text`` and
+    the matches are stored on the ``regex_metadata`` attribute of the element metadata.
+    Elements that are not ``Text`` instances are left untouched.
+
+    Parameters
+    ----------
+    elements
+        The elements to annotate. They are modified in place.
+    regex_metadata
+        Mapping of field name to regex pattern. ``None`` is treated as an empty mapping.
+
+    Returns
+    -------
+    The same ``elements`` list that was passed in.
+    """
+    # ``None`` replaces the former mutable ``{}`` default and is also accepted from
+    # callers that pass it explicitly.
+    patterns = regex_metadata or {}
+    for element in elements:
+        if not isinstance(element, Text):
+            continue
+
+        matches_by_field: Dict[str, List[RegexMetadata]] = {}
+        for field_name, pattern in patterns.items():
+            results: List[RegexMetadata] = [
+                {"text": match.group(), "start": match.start(), "end": match.end()}
+                for match in re.finditer(pattern, element.text)
+            ]
+            # Fields without any match are omitted rather than stored as empty lists.
+            if results:
+                matches_by_field[field_name] = results
+
+        # Assigned even when empty, so every Text element carries a dict here.
+        element.metadata.regex_metadata = matches_by_field
+
+    return elements
+
+
 class Element(ABC):
    """An element is a section of a page in the document."""

--- a/unstructured/partition/csv.py
+++ b/unstructured/partition/csv.py
@ -4,17 +4,24 @@ from typing import IO, BinaryIO, List, Optional, Union, cast
 import lxml.html
 import pandas as pd

-from unstructured.documents.elements import Element, ElementMetadata, Table
+from unstructured.documents.elements import (
+    Element,
+    ElementMetadata,
+    Table,
+    process_metadata,
+)
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed


+@process_metadata()
@add_metadata_with_filetype(FileType.CSV)
 def partition_csv(
    filename: Optional[str] = None,
    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
+    **kwargs,
 ) -> List[Element]:
    """Partitions Microsoft Excel Documents in .csv format into its document elements.

--- a/unstructured/partition/doc.py
+++ b/unstructured/partition/doc.py
@ -2,17 +2,19 @@ import os
 import tempfile
 from typing import IO, List, Optional

-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import convert_office_doc, exactly_one
 from unstructured.partition.docx import partition_docx


+@process_metadata()
@add_metadata_with_filetype(FileType.DOC)
 def partition_doc(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    include_page_breaks: bool = True,
+    **kwargs,
 ) -> List[Element]:
    """Partitions Microsoft Word Documents in .doc format into its document elements.

--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@ -22,6 +22,7 @@ from unstructured.documents.elements import (
    Table,
    Text,
    Title,
+    process_metadata,
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import (
@ -102,12 +103,14 @@ def _get_paragraph_runs(paragraph):
 Paragraph.runs = property(lambda self: _get_paragraph_runs(self))


+@process_metadata()
@add_metadata_with_filetype(FileType.DOCX)
 def partition_docx(
    filename: Optional[str] = None,
    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
    include_page_breaks: bool = True,
+    **kwargs,
 ) -> List[Element]:
    """Partitions Microsoft Word Documents in .docx format into its document elements.

--- a/unstructured/partition/email.py
+++ b/unstructured/partition/email.py
@ -29,6 +29,7 @@ from unstructured.documents.elements import (
    NarrativeText,
    Text,
    Title,
+    process_metadata,
 )
 from unstructured.documents.email_elements import (
    MetaData,
@ -182,6 +183,7 @@ def find_embedded_image(
    return Image(text=image_info[:-1]), element


+@process_metadata()
@add_metadata_with_filetype(FileType.EML)
 def partition_email(
    filename: Optional[str] = None,
@ -190,6 +192,7 @@ def partition_email(
    content_source: str = "text/html",
    encoding: Optional[str] = None,
    include_headers: bool = False,
+    **kwargs,
 ) -> List[Element]:
    """Partitions an .eml document into its constituent elements.
    Parameters
--- a/unstructured/partition/epub.py
+++ b/unstructured/partition/epub.py
@ -1,15 +1,17 @@
 from typing import IO, List, Optional

-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.html import convert_and_partition_html


+@process_metadata()
@add_metadata_with_filetype(FileType.EPUB)
 def partition_epub(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    include_page_breaks: bool = False,
+    **kwargs,
 ) -> List[Element]:
    """Partitions an EPUB document. The document is first converted to HTML and then
    partitioned using partition_html.
--- a/unstructured/partition/html.py
+++ b/unstructured/partition/html.py
@ -2,7 +2,7 @@ from typing import IO, Dict, List, Optional

 import requests

-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.documents.html import HTMLDocument
 from unstructured.documents.xml import VALID_PARSERS
 from unstructured.file_utils.encoding import read_txt_file
@ -17,6 +17,7 @@ from unstructured.partition.common import (
 )


+@process_metadata()
@add_metadata_with_filetype(FileType.HTML)
 def partition_html(
    filename: Optional[str] = None,
@ -29,6 +30,7 @@ def partition_html(
    headers: Dict[str, str] = {},
    ssl_verify: bool = True,
    parser: VALID_PARSERS = None,
+    **kwargs,
 ) -> List[Element]:
    """Partitions an HTML document into its constituent elements.

--- a/unstructured/partition/image.py
+++ b/unstructured/partition/image.py
@ -1,10 +1,11 @@
 from typing import List, Optional

-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.partition.common import exactly_one
 from unstructured.partition.pdf import partition_pdf_or_image


+@process_metadata()
 def partition_image(
    filename: str = "",
    file: Optional[bytes] = None,
@ -14,6 +15,7 @@ def partition_image(
    include_page_breaks: bool = False,
    ocr_languages: str = "eng",
    strategy: str = "auto",
+    **kwargs,
 ) -> List[Element]:
    """Parses an image into a list of interpreted elements.

--- a/unstructured/partition/json.py
+++ b/unstructured/partition/json.py
@ -2,18 +2,20 @@ import json
 import re
 from typing import IO, List, Optional

-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
 from unstructured.partition.common import exactly_one
 from unstructured.staging.base import dict_to_elements


+@process_metadata()
@add_metadata_with_filetype(FileType.JSON)
 def partition_json(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    text: Optional[str] = None,
+    **kwargs,
 ) -> List[Element]:
    """Partitions a .json document into its constituent elements."""
    if text is not None and text.strip() == "" and not file and not filename:
--- a/unstructured/partition/md.py
+++ b/unstructured/partition/md.py
@ -3,7 +3,7 @@ from typing import IO, List, Optional, Union
 import markdown
 import requests

-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.documents.xml import VALID_PARSERS
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import exactly_one
@ -16,6 +16,7 @@ def optional_decode(contents: Union[str, bytes]) -> str:
    return contents


+@process_metadata()
@add_metadata_with_filetype(FileType.MD)
 def partition_md(
    filename: Optional[str] = None,
@ -25,6 +26,7 @@ def partition_md(
    include_page_breaks: bool = False,
    include_metadata: bool = True,
    parser: VALID_PARSERS = None,
+    **kwargs,
 ) -> List[Element]:
    # Verify that only one of the arguments was provided
    if text is None:
--- a/unstructured/partition/msg.py
+++ b/unstructured/partition/msg.py
@ -3,7 +3,7 @@ from typing import IO, Dict, List, Optional

 import msg_parser

-from unstructured.documents.elements import Element, ElementMetadata
+from unstructured.documents.elements import Element, ElementMetadata, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import exactly_one
 from unstructured.partition.email import convert_to_iso_8601
@ -11,10 +11,12 @@ from unstructured.partition.html import partition_html
 from unstructured.partition.text import partition_text


+@process_metadata()
@add_metadata_with_filetype(FileType.MSG)
 def partition_msg(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
+    **kwargs,
 ) -> List[Element]:
    """Partitions a MSFT Outlook .msg file

--- a/unstructured/partition/odt.py
+++ b/unstructured/partition/odt.py
@ -1,12 +1,17 @@
 from typing import IO, List, Optional

-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.docx import convert_and_partition_docx


+@process_metadata()
@add_metadata_with_filetype(FileType.ODT)
-def partition_odt(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
+def partition_odt(
+    filename: Optional[str] = None,
+    file: Optional[IO] = None,
+    **kwargs,
+) -> List[Element]:
    """Partitions Open Office Documents in .odt format into its document elements.

    Parameters
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -9,7 +9,12 @@ from pdfminer.utils import open_filename
 from PIL import Image

 from unstructured.cleaners.core import clean_extra_whitespace
-from unstructured.documents.elements import Element, ElementMetadata, PageBreak
+from unstructured.documents.elements import (
+    Element,
+    ElementMetadata,
+    PageBreak,
+    process_metadata,
+)
 from unstructured.file_utils.filetype import (
    FileType,
    add_metadata_with_filetype,
@ -26,6 +31,7 @@ from unstructured.partition.text import element_from_text, partition_text
 from unstructured.utils import requires_dependencies


+@process_metadata()
@add_metadata_with_filetype(FileType.PDF)
 def partition_pdf(
    filename: str = "",
@ -37,6 +43,7 @@ def partition_pdf(
    strategy: str = "auto",
    infer_table_structure: bool = False,
    ocr_languages: str = "eng",
+    **kwargs,
 ) -> List[Element]:
    """Parses a pdf document into a list of interpreted elements.
    Parameters
--- a/unstructured/partition/ppt.py
+++ b/unstructured/partition/ppt.py
@ -2,17 +2,19 @@ import os
 import tempfile
 from typing import IO, List, Optional

-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import convert_office_doc, exactly_one
 from unstructured.partition.pptx import partition_pptx


+@process_metadata()
@add_metadata_with_filetype(FileType.PPT)
 def partition_ppt(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    include_page_breaks: bool = False,
+    **kwargs,
 ) -> List[Element]:
    """Partitions Microsoft PowerPoint Documents in .ppt format into their document elements.

--- a/unstructured/partition/pptx.py
+++ b/unstructured/partition/pptx.py
@ -12,6 +12,7 @@ from unstructured.documents.elements import (
    Table,
    Text,
    Title,
+    process_metadata,
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import (
@ -27,12 +28,14 @@ from unstructured.partition.text_type import (
 OPENXML_SCHEMA_NAME = "{http://schemas.openxmlformats.org/drawingml/2006/main}"


+@process_metadata()
@add_metadata_with_filetype(FileType.PPTX)
 def partition_pptx(
    filename: Optional[str] = None,
    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    include_page_breaks: bool = True,
    metadata_filename: Optional[str] = None,
+    **kwargs,
 ) -> List[Element]:
    """Partitions Microsoft PowerPoint Documents in .pptx format into its document elements.

--- a/unstructured/partition/rst.py
+++ b/unstructured/partition/rst.py
@ -1,15 +1,17 @@
 from typing import IO, List, Optional

-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.html import convert_and_partition_html


+@process_metadata()
@add_metadata_with_filetype(FileType.RST)
 def partition_rst(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    include_page_breaks: bool = False,
+    **kwargs,
 ) -> List[Element]:
    """Partitions an RST document. The document is first converted to HTML and then
    partitioned using partition_html.
--- a/unstructured/partition/rtf.py
+++ b/unstructured/partition/rtf.py
@ -1,15 +1,17 @@
 from typing import IO, List, Optional

-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.html import convert_and_partition_html


+@process_metadata()
@add_metadata_with_filetype(FileType.RTF)
 def partition_rtf(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    include_page_breaks: bool = False,
+    **kwargs,
 ) -> List[Element]:
    """Partitions an RTF document. The document is first converted to HTML and then
    partitioned using partiton_html.
--- a/unstructured/partition/text.py
+++ b/unstructured/partition/text.py
@ -10,6 +10,7 @@ from unstructured.documents.elements import (
    NarrativeText,
    Text,
    Title,
+    process_metadata,
 )
 from unstructured.file_utils.encoding import read_txt_file
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
@ -27,6 +28,7 @@ def split_by_paragraph(content: str) -> List[str]:
    return re.split(PARAGRAPH_PATTERN, content)


+@process_metadata()
@add_metadata_with_filetype(FileType.TXT)
 def partition_text(
    filename: Optional[str] = None,
@ -36,6 +38,7 @@ def partition_text(
    paragraph_grouper: Optional[Callable[[str], str]] = None,
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
+    **kwargs,
 ) -> List[Element]:
    """Partitions a .txt document into its constituent elements.
    Parameters
--- a/unstructured/partition/tsv.py
+++ b/unstructured/partition/tsv.py
@ -4,17 +4,24 @@ from typing import IO, BinaryIO, List, Optional, Union, cast
 import lxml.html
 import pandas as pd

-from unstructured.documents.elements import Element, ElementMetadata, Table
+from unstructured.documents.elements import (
+    Element,
+    ElementMetadata,
+    Table,
+    process_metadata,
+)
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed


+@process_metadata()
@add_metadata_with_filetype(FileType.TSV)
 def partition_tsv(
    filename: Optional[str] = None,
    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
+    **kwargs,
 ) -> List[Element]:
    """Partitions TSV files into document elements.

--- a/unstructured/partition/xlsx.py
+++ b/unstructured/partition/xlsx.py
@ -4,17 +4,24 @@ from typing import IO, BinaryIO, List, Optional, Union, cast
 import lxml.html
 import pandas as pd

-from unstructured.documents.elements import Element, ElementMetadata, Table
+from unstructured.documents.elements import (
+    Element,
+    ElementMetadata,
+    Table,
+    process_metadata,
+)
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed


+@process_metadata()
@add_metadata_with_filetype(FileType.XLSX)
 def partition_xlsx(
    filename: Optional[str] = None,
    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
+    **kwargs,
 ) -> List[Element]:
    """Partitions Microsoft Excel Documents in .xlsx format into its document elements.

--- a/unstructured/partition/xml.py
+++ b/unstructured/partition/xml.py
@ -1,7 +1,8 @@
 import xml.etree.ElementTree as ET
 from tempfile import SpooledTemporaryFile
-from typing import IO, BinaryIO, Optional, Union, cast
+from typing import IO, BinaryIO, List, Optional, Union, cast

+from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.encoding import read_txt_file
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
@ -38,6 +39,7 @@ def get_leaf_elements(
    return "\n".join(leaf_elements)  # type: ignore


+@process_metadata()
@add_metadata_with_filetype(FileType.XML)
 def partition_xml(
    filename: Optional[str] = None,
@ -47,7 +49,8 @@ def partition_xml(
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
    encoding: Optional[str] = None,
-):
+    **kwargs,
+) -> List[Element]:
    """Partitions an XML document into its document elements.

    Parameters