feat: add --metadata-include and --metadata-exclude parameters to unstructured-ingest (#368)

* added metadata in/exclude params

* updated process_file

* existing tests

* remove default behavior

* changelog and ci

* line length

* import

* import

* import sorted

* import

* type

* line length

* main

* ci

* json

* dict

* type ignore

* lint

* unit tests for process_file

* lint

* type changed to Optional[str]

* ci

* line length

* added mutex check

* nit
natygyoon 2023-03-22 03:30:53 +09:00 committed by GitHub
parent d5a0fce6a0
commit c16862e7b3
17 changed files with 175 additions and 8 deletions

View File

@@ -6,6 +6,7 @@
### Features
* Add `--metadata-include` and `--metadata-exclude` parameters to `unstructured-ingest`
* Add `clean_non_ascii_chars` to remove non-ascii characters from unicode string
### Fixes

View File

@@ -4,6 +4,7 @@ SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/.. || exit 1
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename \
--remote-url abfs://container1/ \
--azure-account-name azureunstructured1 \
--structured-output-dir azure-ingest-output \

View File

@@ -10,6 +10,7 @@ if [[ "$(find test_unstructured_ingest/expected-structured-output/biomed-ingest-
fi
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename \
--biomed-api-from "2019-01-02" \
--biomed-api-until "2019-01-02+00:03:10" \
--structured-output-dir biomed-ingest-output-api \

View File

@@ -10,6 +10,7 @@ if [[ "$(find test_unstructured_ingest/expected-structured-output/biomed-ingest-
fi
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename \
--biomed-path "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" \
--structured-output-dir biomed-ingest-output-path \
--num-processes 2 \

View File

@@ -12,7 +12,12 @@ if [[ "$CI" == "true" ]]; then
fi
PYTHONPATH=. ./unstructured/ingest/main.py --github-url dcneiner/Downloadify --git-file-glob '*.html,*.txt' --structured-output-dir github-downloadify-output --verbose
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename \
--github-url dcneiner/Downloadify \
--git-file-glob '*.html,*.txt' \
--structured-output-dir github-downloadify-output \
--verbose
if ! diff -ru test_unstructured_ingest/expected-structured-output/github-downloadify github-downloadify-output ; then
echo

View File

@@ -4,6 +4,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/.. || exit 1
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename \
--gitlab-url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab \
--git-file-glob '*.md,*.txt' \
--structured-output-dir gitlab-ingest-output \

View File

@@ -9,7 +9,11 @@ if [[ "$(find test_unstructured_ingest/expected-structured-output/s3-small-batch
exit 1
fi
PYTHONPATH=. ./unstructured/ingest/main.py --s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ --s3-anonymous --structured-output-dir s3-small-batch-output
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename \
--s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
--s3-anonymous \
--structured-output-dir s3-small-batch-output \
if ! diff -ru test_unstructured_ingest/expected-structured-output/s3-small-batch s3-small-batch-output ; then
echo

View File

@@ -4,6 +4,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/.. || exit 1
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename \
--wikipedia-page-title "Open Source Software" \
--structured-output-dir wikipedia-ingest-output \
--num-processes 2 \

View File

@@ -0,0 +1,78 @@
import os
import pathlib

import pytest

from unstructured.ingest.connector.git import GitIngestDoc, SimpleGitConfig

DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "example-docs")

test_files = [
    "layout-parser-paper-fast.jpg",
    "layout-parser-paper-fast.pdf",
]


@pytest.mark.parametrize("filename", test_files)
def test_process_file_include_filename(filename: str):
    ingest_doc = GitIngestDoc(
        path=filename,
        config=SimpleGitConfig(
            download_dir=EXAMPLE_DOCS_DIRECTORY,
            metadata_include="filename",
        ),
    )
    isd_elems = ingest_doc.process_file()
    for elem in isd_elems:
        for k in elem["metadata"]:
            assert k == "filename"


@pytest.mark.parametrize("filename", test_files)
def test_process_file_include_filename_pagenum(filename: str):
    ingest_doc = GitIngestDoc(
        path=filename,
        config=SimpleGitConfig(
            download_dir=EXAMPLE_DOCS_DIRECTORY,
            metadata_include="filename,page_number",
        ),
    )
    isd_elems = ingest_doc.process_file()
    for elem in isd_elems:
        for k in elem["metadata"]:
            assert k in ["filename", "page_number"]


@pytest.mark.parametrize("filename", test_files)
def test_process_file_exclude_filename(filename: str):
    ingest_doc = GitIngestDoc(
        path=filename,
        config=SimpleGitConfig(
            download_dir=EXAMPLE_DOCS_DIRECTORY,
            metadata_exclude="filename",
        ),
    )
    isd_elems = ingest_doc.process_file()
    for elem in isd_elems:
        for k in elem["metadata"]:
            assert k != "filename"


@pytest.mark.parametrize("filename", test_files)
def test_process_file_exclude_filename_pagenum(filename: str):
    ingest_doc = GitIngestDoc(
        path=filename,
        config=SimpleGitConfig(
            download_dir=EXAMPLE_DOCS_DIRECTORY,
            metadata_exclude="filename,page_number",
        ),
    )
    isd_elems = ingest_doc.process_file()
    for elem in isd_elems:
        for k in elem["metadata"]:
            assert k not in ["filename", "page_number"]
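
For context on what these inner loops iterate over: each element returned by process_file is an ISD-style dict, and the tests only look at its "metadata" key. The sample below is an assumed illustration (the exact keys and values vary by file type and unstructured version, and none of it is taken from the real fixtures); it shows what the include tests expect to survive.

# Assumed shape for illustration only -- real elements vary by partitioner.
sample_elem = {
    "type": "NarrativeText",
    "text": "An example paragraph from the document.",
    "metadata": {
        "filename": "layout-parser-paper-fast.pdf",
        "filetype": "application/pdf",
        "page_number": 1,
    },
}
# After processing with metadata_include="filename", the tests above expect:
# sample_elem["metadata"] == {"filename": "layout-parser-paper-fast.pdf"}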

View File

@@ -5,7 +5,7 @@ from dataclasses import dataclass
from datetime import datetime
from ftplib import FTP, error_perm
from pathlib import Path
from typing import List, Union
from typing import List, Optional, Union
import requests
from bs4 import BeautifulSoup
@@ -48,6 +48,8 @@ class SimpleBiomedConfig(BaseConnectorConfig):
output_dir: str
re_download: bool = False
preserve_downloads: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
def _validate_date_args(self, date):
date_formats = ["%Y-%m-%d", "%Y-%m-%d+%H:%M:%S"]

View File

@@ -3,7 +3,7 @@ import os
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Type
from typing import Optional, Type
from unstructured.ingest.interfaces import (
BaseConnector,
@@ -29,6 +29,8 @@ class SimpleFsspecConfig(BaseConnectorConfig):
output_dir: str
preserve_downloads: bool = False
re_download: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
# fsspec specific options
access_kwargs: dict = field(default_factory=dict)

View File

@@ -26,6 +26,8 @@ class SimpleGitConfig(BaseConnectorConfig):
output_dir: str
preserve_downloads: bool = False
re_download: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
repo_path: str = field(init=False, repr=False)

View File

@@ -4,7 +4,7 @@ import os
from dataclasses import dataclass
from mimetypes import guess_extension
from pathlib import Path
from typing import Dict
from typing import Dict, Optional
from unstructured.file_utils.filetype import EXT_TO_FILETYPE
from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
@@ -77,6 +77,8 @@ class SimpleGoogleDriveConfig(BaseConnectorConfig):
output_dir: str
re_download: bool = False
preserve_downloads: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
recursive: bool = False

View File

@@ -2,7 +2,7 @@ import json
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Optional
from unstructured.ingest.interfaces import (
BaseConnector,
@@ -31,6 +31,8 @@ class SimpleRedditConfig(BaseConnectorConfig):
output_dir: str
preserve_downloads: bool = False
re_download: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
def __post_init__(self):
if self.num_posts <= 0:

View File

@@ -2,7 +2,7 @@ import json
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Optional
from unstructured.ingest.interfaces import (
BaseConnector,
@@ -26,6 +26,8 @@ class SimpleWikipediaConfig(BaseConnectorConfig):
output_dir: str
preserve_downloads: bool = False
re_download: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
@dataclass

View File

@@ -2,6 +2,7 @@
through Unstructured."""
from abc import ABC, abstractmethod
from typing import Optional
from unstructured.ingest.logger import logger
from unstructured.partition.auto import partition
@@ -47,6 +48,8 @@ class BaseConnectorConfig(ABC):
# where to write structured data outputs
output_dir: str
re_download: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
class BaseIngestDoc(ABC):
@@ -58,6 +61,8 @@
Crucially, it is not responsible for the actual processing of the raw document.
"""
config: BaseConnectorConfig
@property
@abstractmethod
def filename(self):
@@ -94,7 +99,24 @@
self.isd_elems_no_filename = []
for elem in isd_elems:
# type: ignore
elem["metadata"].pop("filename", None) # type: ignore[attr-defined]
if (
self.config.metadata_exclude is not None
and self.config.metadata_include is not None
):
raise ValueError(
"Arguments `--metadata-include` and `--metadata-exclude` are "
"mutually exclusive with each other.",
)
elif self.config.metadata_exclude is not None:
ex_list = self.config.metadata_exclude.split(",")
for ex in ex_list:
elem["metadata"].pop(ex, None) # type: ignore[attr-defined]
elif self.config.metadata_include is not None:
in_list = self.config.metadata_include.split(",")
for k in elem["metadata"]:
if k not in in_list:
elem["metadata"].pop(k, None) # type: ignore[attr-defined]
elem.pop("coordinates") # type: ignore[attr-defined]
self.isd_elems_no_filename.append(elem)
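
For readers skimming the hunk above, the include/exclude semantics can be summarized as a small standalone helper. The sketch below is illustrative only: filter_metadata and the sample field names are invented for this note, and the actual code mutates each ISD element in place inside process_file rather than returning a new dict.

from typing import Any, Dict, Optional

def filter_metadata(
    metadata: Dict[str, Any],
    metadata_include: Optional[str] = None,
    metadata_exclude: Optional[str] = None,
) -> Dict[str, Any]:
    """Apply comma-separated include/exclude field lists to a metadata dict."""
    if metadata_include is not None and metadata_exclude is not None:
        # Mirrors the guard above: the two options are mutually exclusive.
        raise ValueError("--metadata-include and --metadata-exclude are mutually exclusive.")
    if metadata_exclude is not None:
        excluded = metadata_exclude.split(",")
        return {k: v for k, v in metadata.items() if k not in excluded}
    if metadata_include is not None:
        included = metadata_include.split(",")
        # Building a new dict avoids popping keys from a dict while iterating it.
        return {k: v for k, v in metadata.items() if k in included}
    return metadata

# Illustrative field names only:
meta = {"filename": "sbaa031.073.PMC7234218.pdf", "page_number": 1, "filetype": "application/pdf"}
assert filter_metadata(meta, metadata_include="filename,page_number") == {
    "filename": "sbaa031.073.PMC7234218.pdf",
    "page_number": 1,
}

Because the value is split on bare commas, field names must be passed without surrounding spaces (filename,page_number, not filename, page_number).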

View File

@@ -103,6 +103,20 @@ class MainProcess:
@click.command()
@click.option(
"--metadata-include",
default=None,
help="If set, include the specified metadata fields if they exist and drop all other fields. "
"Usage: provide a single string with comma separated values. "
"Example: --metadata-include filename,page_number ",
)
@click.option(
"--metadata-exclude",
default=None,
help="If set, drop the specified metadata fields if they exist. "
"Usage: provide a single string with comma separated values. "
"Example: --metadata-exclude filename,page_number ",
)
@click.option(
"--remote-url",
default=None,
@@ -322,7 +336,15 @@ def main(
reprocess,
num_processes,
verbose,
metadata_include,
metadata_exclude,
):
if metadata_exclude is not None and metadata_include is not None:
logger.error(
"Arguments `--metadata-include` and `--metadata-exclude` are "
"mutually exclusive with each other.",
)
sys.exit(1)
if not preserve_downloads and download_dir:
logger.warning(
"Not preserving downloaded files but --download_dir is specified",
@@ -391,6 +413,8 @@ def main(
output_dir=structured_output_dir,
re_download=re_download,
preserve_downloads=preserve_downloads,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
),
)
elif protocol in ("abfs", "az"):
@@ -411,6 +435,8 @@
output_dir=structured_output_dir,
re_download=re_download,
preserve_downloads=preserve_downloads,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
),
)
else:
@@ -427,6 +453,8 @@
output_dir=structured_output_dir,
re_download=re_download,
preserve_downloads=preserve_downloads,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
),
)
elif github_url:
@@ -441,6 +469,8 @@
preserve_downloads=preserve_downloads,
output_dir=structured_output_dir,
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
),
)
elif gitlab_url:
@@ -455,6 +485,8 @@
preserve_downloads=preserve_downloads,
output_dir=structured_output_dir,
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
),
)
elif subreddit_name:
@@ -471,6 +503,8 @@
preserve_downloads=preserve_downloads,
output_dir=structured_output_dir,
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
),
)
elif wikipedia_page_title:
@@ -483,6 +517,8 @@
preserve_downloads=preserve_downloads,
output_dir=structured_output_dir,
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
),
)
elif drive_id:
@@ -497,6 +533,8 @@
preserve_downloads=preserve_downloads,
output_dir=structured_output_dir,
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
),
)
elif biomed_path or biomed_api_id or biomed_api_from or biomed_api_until:
@@ -511,6 +549,8 @@
preserve_downloads=preserve_downloads,
output_dir=structured_output_dir,
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
),
)
# Check for other connector-specific options here and define the doc_connector object
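
To see the new mutual-exclusion guard fail fast without configuring any connector, click's test runner can drive the command directly. This is a sketch under assumptions: that the command object is importable as main from unstructured.ingest.main (as this diff suggests), that none of the other options are required, and that the ingest dependencies are installed.

from click.testing import CliRunner

from unstructured.ingest.main import main  # assumed import path, per this diff

runner = CliRunner()
result = runner.invoke(
    main,
    ["--metadata-include", "filename", "--metadata-exclude", "page_number"],
)
# The guard in main() logs the mutual-exclusion error and exits with status 1.
assert result.exit_code == 1

Passing only one of the two options instead proceeds to normal connector selection.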