feat: add --metadata-include and --metadata-exclude parameters to unstructured-ingest (#368)

* added metadata in/exclude params

* updated process_file

* existing tests

* remove default behavior

* changelog and ci

* line length

* import

* import

* import sorted

* import

* type

* line length

* main

* ci

* json

* dict

* type ignore

* lint

* unit tests for process_file

* lint

* type changed to Optional[str]

* ci

* line length

* added mutex check

* nit
natygyoon 2023-03-22 03:30:53 +09:00 committed by GitHub
parent d5a0fce6a0
commit c16862e7b3
17 changed files with 175 additions and 8 deletions

View File

@@ -6,6 +6,7 @@
### Features
* Add `--metadata-include` and `--metadata-exclude` parameters to `unstructured-ingest`
* Add `clean_non_ascii_chars` to remove non-ascii characters from unicode string
### Fixes

View File

@@ -4,6 +4,7 @@ SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/.. || exit 1
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename \
--remote-url abfs://container1/ \
--azure-account-name azureunstructured1 \
--structured-output-dir azure-ingest-output \

View File

@@ -10,6 +10,7 @@ if [[ "$(find test_unstructured_ingest/expected-structured-output/biomed-ingest-
fi
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename \
--biomed-api-from "2019-01-02" \
--biomed-api-until "2019-01-02+00:03:10" \
--structured-output-dir biomed-ingest-output-api \

View File

@@ -10,6 +10,7 @@ if [[ "$(find test_unstructured_ingest/expected-structured-output/biomed-ingest-
fi
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename \
--biomed-path "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" \
--structured-output-dir biomed-ingest-output-path \
--num-processes 2 \

View File

@@ -12,7 +12,12 @@ if [[ "$CI" == "true" ]]; then
fi
PYTHONPATH=. ./unstructured/ingest/main.py --github-url dcneiner/Downloadify --git-file-glob '*.html,*.txt' --structured-output-dir github-downloadify-output --verbose
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename \
--github-url dcneiner/Downloadify \
--git-file-glob '*.html,*.txt' \
--structured-output-dir github-downloadify-output \
--verbose
if ! diff -ru test_unstructured_ingest/expected-structured-output/github-downloadify github-downloadify-output ; then
echo

View File

@@ -4,6 +4,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/.. || exit 1
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename \
--gitlab-url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab \
--git-file-glob '*.md,*.txt' \
--structured-output-dir gitlab-ingest-output \

View File

@@ -9,7 +9,11 @@ if [[ "$(find test_unstructured_ingest/expected-structured-output/s3-small-batch
exit 1
fi
PYTHONPATH=. ./unstructured/ingest/main.py --s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ --s3-anonymous --structured-output-dir s3-small-batch-output
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename \
--s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
--s3-anonymous \
--structured-output-dir s3-small-batch-output \
if ! diff -ru test_unstructured_ingest/expected-structured-output/s3-small-batch s3-small-batch-output ; then
echo

View File

@@ -4,6 +4,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/.. || exit 1
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename \
--wikipedia-page-title "Open Source Software" \
--structured-output-dir wikipedia-ingest-output \
--num-processes 2 \

View File

@@ -0,0 +1,78 @@
import os
import pathlib

import pytest

from unstructured.ingest.connector.git import GitIngestDoc, SimpleGitConfig

DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "example-docs")

test_files = [
    "layout-parser-paper-fast.jpg",
    "layout-parser-paper-fast.pdf",
]


@pytest.mark.parametrize("filename", test_files)
def test_process_file_include_filename(filename: str):
    ingest_doc = GitIngestDoc(
        path=filename,
        config=SimpleGitConfig(
            download_dir=EXAMPLE_DOCS_DIRECTORY,
            metadata_include="filename",
        ),
    )
    isd_elems = ingest_doc.process_file()
    for elem in isd_elems:
        for k in elem["metadata"]:
            assert k == "filename"


@pytest.mark.parametrize("filename", test_files)
def test_process_file_include_filename_pagenum(filename: str):
    ingest_doc = GitIngestDoc(
        path=filename,
        config=SimpleGitConfig(
            download_dir=EXAMPLE_DOCS_DIRECTORY,
            metadata_include="filename,page_number",
        ),
    )
    isd_elems = ingest_doc.process_file()
    for elem in isd_elems:
        for k in elem["metadata"]:
            assert k in ["filename", "page_number"]


@pytest.mark.parametrize("filename", test_files)
def test_process_file_exclude_filename(filename: str):
    ingest_doc = GitIngestDoc(
        path=filename,
        config=SimpleGitConfig(
            download_dir=EXAMPLE_DOCS_DIRECTORY,
            metadata_exclude="filename",
        ),
    )
    isd_elems = ingest_doc.process_file()
    for elem in isd_elems:
        for k in elem["metadata"]:
            assert k != "filename"


@pytest.mark.parametrize("filename", test_files)
def test_process_file_exclude_filename_pagenum(filename: str):
    ingest_doc = GitIngestDoc(
        path=filename,
        config=SimpleGitConfig(
            download_dir=EXAMPLE_DOCS_DIRECTORY,
            metadata_exclude="filename,page_number",
        ),
    )
    isd_elems = ingest_doc.process_file()
    for elem in isd_elems:
        for k in elem["metadata"]:
            assert k not in ["filename", "page_number"]
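
For context on what these inner loops iterate over: each element returned by process_file is an ISD-style dict, and the tests only look at its "metadata" key. The sample below is an assumed illustration (the exact keys and values vary by file type and unstructured version, and none of it is taken from the real fixtures); it shows what the include tests expect to survive.

# Assumed shape for illustration only -- real elements vary by partitioner.
sample_elem = {
    "type": "NarrativeText",
    "text": "An example paragraph from the document.",
    "metadata": {
        "filename": "layout-parser-paper-fast.pdf",
        "filetype": "application/pdf",
        "page_number": 1,
    },
}
# After processing with metadata_include="filename", the tests above expect:
# sample_elem["metadata"] == {"filename": "layout-parser-paper-fast.pdf"}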

View File

@@ -5,7 +5,7 @@ from dataclasses import dataclass
from datetime import datetime
from ftplib import FTP, error_perm
from pathlib import Path
from typing import List, Union
from typing import List, Optional, Union
import requests
from bs4 import BeautifulSoup
@@ -48,6 +48,8 @@ class SimpleBiomedConfig(BaseConnectorConfig):
output_dir: str
re_download: bool = False
preserve_downloads: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
def _validate_date_args(self, date):
date_formats = ["%Y-%m-%d", "%Y-%m-%d+%H:%M:%S"]

View File

@@ -3,7 +3,7 @@ import os
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Type
from typing import Optional, Type
from unstructured.ingest.interfaces import (
BaseConnector,
@@ -29,6 +29,8 @@ class SimpleFsspecConfig(BaseConnectorConfig):
output_dir: str
preserve_downloads: bool = False
re_download: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
# fsspec specific options
access_kwargs: dict = field(default_factory=dict)

View File

@@ -26,6 +26,8 @@ class SimpleGitConfig(BaseConnectorConfig):
output_dir: str
preserve_downloads: bool = False
re_download: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
repo_path: str = field(init=False, repr=False)

View File

@@ -4,7 +4,7 @@ import os
from dataclasses import dataclass
from mimetypes import guess_extension
from pathlib import Path
from typing import Dict
from typing import Dict, Optional
from unstructured.file_utils.filetype import EXT_TO_FILETYPE
from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
@@ -77,6 +77,8 @@ class SimpleGoogleDriveConfig(BaseConnectorConfig):
output_dir: str
re_download: bool = False
preserve_downloads: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
recursive: bool = False

View File

@@ -2,7 +2,7 @@ import json
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Optional
from unstructured.ingest.interfaces import (
BaseConnector,
@@ -31,6 +31,8 @@ class SimpleRedditConfig(BaseConnectorConfig):
output_dir: str
preserve_downloads: bool = False
re_download: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
def __post_init__(self):
if self.num_posts <= 0:

View File

@@ -2,7 +2,7 @@ import json
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Optional
from unstructured.ingest.interfaces import (
BaseConnector,
@@ -26,6 +26,8 @@ class SimpleWikipediaConfig(BaseConnectorConfig):
output_dir: str
preserve_downloads: bool = False
re_download: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
@dataclass

View File

@@ -2,6 +2,7 @@
through Unstructured."""
from abc import ABC, abstractmethod
from typing import Optional
from unstructured.ingest.logger import logger
from unstructured.partition.auto import partition
@@ -47,6 +48,8 @@ class BaseConnectorConfig(ABC):
# where to write structured data outputs
output_dir: str
re_download: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
class BaseIngestDoc(ABC):
@@ -58,6 +61,8 @@
Crucially, it is not responsible for the actual processing of the raw document.
"""
config: BaseConnectorConfig
@property
@abstractmethod
def filename(self):
@@ -94,7 +99,24 @@
self.isd_elems_no_filename = []
for elem in isd_elems:
# type: ignore
elem["metadata"].pop("filename", None) # type: ignore[attr-defined]
if (
self.config.metadata_exclude is not None
and self.config.metadata_include is not None
):
raise ValueError(
"Arguments `--metadata-include` and `--metadata-exclude` are "
"mutually exclusive with each other.",
)
elif self.config.metadata_exclude is not None:
ex_list = self.config.metadata_exclude.split(",")
for ex in ex_list:
elem["metadata"].pop(ex, None) # type: ignore[attr-defined]
elif self.config.metadata_include is not None:
in_list = self.config.metadata_include.split(",")
for k in elem["metadata"]:
if k not in in_list:
elem["metadata"].pop(k, None) # type: ignore[attr-defined]
elem.pop("coordinates") # type: ignore[attr-defined]
self.isd_elems_no_filename.append(elem)
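
For readers skimming the hunk above, the include/exclude semantics can be summarized as a small standalone helper. The sketch below is illustrative only: filter_metadata and the sample field names are invented for this note, and the actual code mutates each ISD element in place inside process_file rather than returning a new dict.

from typing import Any, Dict, Optional

def filter_metadata(
    metadata: Dict[str, Any],
    metadata_include: Optional[str] = None,
    metadata_exclude: Optional[str] = None,
) -> Dict[str, Any]:
    """Apply comma-separated include/exclude field lists to a metadata dict."""
    if metadata_include is not None and metadata_exclude is not None:
        # Mirrors the guard above: the two options are mutually exclusive.
        raise ValueError("--metadata-include and --metadata-exclude are mutually exclusive.")
    if metadata_exclude is not None:
        excluded = metadata_exclude.split(",")
        return {k: v for k, v in metadata.items() if k not in excluded}
    if metadata_include is not None:
        included = metadata_include.split(",")
        # Building a new dict avoids popping keys from a dict while iterating it.
        return {k: v for k, v in metadata.items() if k in included}
    return metadata

# Illustrative field names only:
meta = {"filename": "sbaa031.073.PMC7234218.pdf", "page_number": 1, "filetype": "application/pdf"}
assert filter_metadata(meta, metadata_include="filename,page_number") == {
    "filename": "sbaa031.073.PMC7234218.pdf",
    "page_number": 1,
}

Because the value is split on bare commas, field names must be passed without surrounding spaces (filename,page_number, not filename, page_number).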

View File

@@ -103,6 +103,20 @@ class MainProcess:
@click.command()
@click.option(
"--metadata-include",
default=None,
help="If set, include the specified metadata fields if they exist and drop all other fields. "
"Usage: provide a single string with comma separated values. "
"Example: --metadata-include filename,page_number ",
)
@click.option(
"--metadata-exclude",
default=None,
help="If set, drop the specified metadata fields if they exist. "
"Usage: provide a single string with comma separated values. "
"Example: --metadata-exclude filename,page_number ",
)
@click.option(
"--remote-url",
default=None,
@@ -322,7 +336,15 @@ def main(
reprocess,
num_processes,
verbose,
metadata_include,
metadata_exclude,
):
if metadata_exclude is not None and metadata_include is not None:
logger.error(
"Arguments `--metadata-include` and `--metadata-exclude` are "
"mutually exclusive with each other.",
)
sys.exit(1)
if not preserve_downloads and download_dir:
logger.warning(
"Not preserving downloaded files but --download_dir is specified",
@@ -391,6 +413,8 @@ def main(
output_dir=structured_output_dir,
re_download=re_download,
preserve_downloads=preserve_downloads,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
),
)
elif protocol in ("abfs", "az"):
@@ -411,6 +435,8 @@
output_dir=structured_output_dir,
re_download=re_download,
preserve_downloads=preserve_downloads,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
),
)
else:
@@ -427,6 +453,8 @@
output_dir=structured_output_dir,
re_download=re_download,
preserve_downloads=preserve_downloads,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
),
)
elif github_url:
@@ -441,6 +469,8 @@
preserve_downloads=preserve_downloads,
output_dir=structured_output_dir,
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
),
)
elif gitlab_url:
@@ -455,6 +485,8 @@
preserve_downloads=preserve_downloads,
output_dir=structured_output_dir,
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
),
)
elif subreddit_name:
@@ -471,6 +503,8 @@
preserve_downloads=preserve_downloads,
output_dir=structured_output_dir,
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
),
)
elif wikipedia_page_title:
@@ -483,6 +517,8 @@
preserve_downloads=preserve_downloads,
output_dir=structured_output_dir,
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
),
)
elif drive_id:
@@ -497,6 +533,8 @@
preserve_downloads=preserve_downloads,
output_dir=structured_output_dir,
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
),
)
elif biomed_path or biomed_api_id or biomed_api_from or biomed_api_until:
@@ -511,6 +549,8 @@
preserve_downloads=preserve_downloads,
output_dir=structured_output_dir,
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
),
)
# Check for other connector-specific options here and define the doc_connector object
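
To see the new mutual-exclusion guard fail fast without configuring any connector, click's test runner can drive the command directly. This is a sketch under assumptions: that the command object is importable as main from unstructured.ingest.main (as this diff suggests), that none of the other options are required, and that the ingest dependencies are installed.

from click.testing import CliRunner

from unstructured.ingest.main import main  # assumed import path, per this diff

runner = CliRunner()
result = runner.invoke(
    main,
    ["--metadata-include", "filename", "--metadata-exclude", "page_number"],
)
# The guard in main() logs the mutual-exclusion error and exits with status 1.
assert result.exit_code == 1

Passing only one of the two options instead proceeds to normal connector selection.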