mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-26 14:45:31 +00:00
feat: add --metadata-include and --metadata-exclude parameters to unstructured-ingest (#368)
* added metadata in/exclude params
* updated process_file
* existing tests
* remove default behavior
* changelog and ci
* line length
* import
* import
* import sorted
* import
* type
* line length
* main
* ci
* json
* dict
* type ignore
* lint
* unit tests for process_file
* lint
* type changed to Optional[str]
* ci
* line length
* added mutex check
* nit
This commit is contained in:
parent
d5a0fce6a0
commit
c16862e7b3
@ -6,6 +6,7 @@
|
||||
|
||||
### Features
|
||||
|
||||
* Add `--metadata-include` and `--metadata-exclude` parameters to `unstructured-ingest`
|
||||
* Add `clean_non_ascii_chars` to remove non-ascii characters from unicode string
|
||||
|
||||
### Fixes
|
||||
|
||||
@ -4,6 +4,7 @@ SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--metadata-exclude filename \
|
||||
--remote-url abfs://container1/ \
|
||||
--azure-account-name azureunstructured1 \
|
||||
--structured-output-dir azure-ingest-output \
|
||||
|
||||
@ -10,6 +10,7 @@ if [[ "$(find test_unstructured_ingest/expected-structured-output/biomed-ingest-
|
||||
fi
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--metadata-exclude filename \
|
||||
--biomed-api-from "2019-01-02" \
|
||||
--biomed-api-until "2019-01-02+00:03:10" \
|
||||
--structured-output-dir biomed-ingest-output-api \
|
||||
|
||||
@ -10,6 +10,7 @@ if [[ "$(find test_unstructured_ingest/expected-structured-output/biomed-ingest-
|
||||
fi
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--metadata-exclude filename \
|
||||
--biomed-path "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" \
|
||||
--structured-output-dir biomed-ingest-output-path \
|
||||
--num-processes 2 \
|
||||
|
||||
@ -12,7 +12,12 @@ if [[ "$CI" == "true" ]]; then
|
||||
fi
|
||||
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py --github-url dcneiner/Downloadify --git-file-glob '*.html,*.txt' --structured-output-dir github-downloadify-output --verbose
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--metadata-exclude filename \
|
||||
--github-url dcneiner/Downloadify \
|
||||
--git-file-glob '*.html,*.txt' \
|
||||
--structured-output-dir github-downloadify-output \
|
||||
--verbose
|
||||
|
||||
if ! diff -ru test_unstructured_ingest/expected-structured-output/github-downloadify github-downloadify-output ; then
|
||||
echo
|
||||
|
||||
@ -4,6 +4,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--metadata-exclude filename \
|
||||
--gitlab-url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab \
|
||||
--git-file-glob '*.md,*.txt' \
|
||||
--structured-output-dir gitlab-ingest-output \
|
||||
|
||||
@ -9,7 +9,11 @@ if [[ "$(find test_unstructured_ingest/expected-structured-output/s3-small-batch
|
||||
exit 1
|
||||
fi
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py --s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ --s3-anonymous --structured-output-dir s3-small-batch-output
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--metadata-exclude filename \
|
||||
--s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
|
||||
--s3-anonymous \
|
||||
--structured-output-dir s3-small-batch-output \
|
||||
|
||||
if ! diff -ru test_unstructured_ingest/expected-structured-output/s3-small-batch s3-small-batch-output ; then
|
||||
echo
|
||||
|
||||
@ -4,6 +4,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--metadata-exclude filename \
|
||||
--wikipedia-page-title "Open Source Software" \
|
||||
--structured-output-dir wikipedia-ingest-output \
|
||||
--num-processes 2 \
|
||||
|
||||
78
test_unstructured_ingest/test_interfaces.py
Normal file
78
test_unstructured_ingest/test_interfaces.py
Normal file
@ -0,0 +1,78 @@
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
import pytest
|
||||
|
||||
from unstructured.ingest.connector.git import GitIngestDoc, SimpleGitConfig
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "example-docs")
|
||||
|
||||
test_files = [
|
||||
"layout-parser-paper-fast.jpg",
|
||||
"layout-parser-paper-fast.pdf",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", test_files)
def test_process_file_include_filename(filename: str):
    """With ``metadata_include="filename"``, no other metadata key may remain."""
    git_config = SimpleGitConfig(
        download_dir=EXAMPLE_DOCS_DIRECTORY,
        metadata_include="filename",
    )
    doc = GitIngestDoc(path=filename, config=git_config)

    for element in doc.process_file():
        # Collect every key that is not the single included field.
        unexpected = [key for key in element["metadata"] if key != "filename"]
        assert not unexpected
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", test_files)
def test_process_file_include_filename_pagenum(filename: str):
    """With two included fields, every remaining metadata key is one of them."""
    allowed = ("filename", "page_number")
    git_config = SimpleGitConfig(
        download_dir=EXAMPLE_DOCS_DIRECTORY,
        metadata_include="filename,page_number",
    )
    doc = GitIngestDoc(path=filename, config=git_config)

    for element in doc.process_file():
        assert all(key in allowed for key in element["metadata"])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", test_files)
def test_process_file_exclude_filename(filename: str):
    """With ``metadata_exclude="filename"``, that key is absent from all metadata."""
    git_config = SimpleGitConfig(
        download_dir=EXAMPLE_DOCS_DIRECTORY,
        metadata_exclude="filename",
    )
    doc = GitIngestDoc(path=filename, config=git_config)

    for element in doc.process_file():
        leaked = [key for key in element["metadata"] if key == "filename"]
        assert not leaked
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", test_files)
def test_process_file_exclude_filename_pagenum(filename: str):
    """With two excluded fields, neither key appears in any element's metadata."""
    excluded = ("filename", "page_number")
    git_config = SimpleGitConfig(
        download_dir=EXAMPLE_DOCS_DIRECTORY,
        metadata_exclude="filename,page_number",
    )
    doc = GitIngestDoc(path=filename, config=git_config)

    for element in doc.process_file():
        assert not any(key in excluded for key in element["metadata"])
|
||||
@ -5,7 +5,7 @@ from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from ftplib import FTP, error_perm
|
||||
from pathlib import Path
|
||||
from typing import List, Union
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
@ -48,6 +48,8 @@ class SimpleBiomedConfig(BaseConnectorConfig):
|
||||
output_dir: str
|
||||
re_download: bool = False
|
||||
preserve_downloads: bool = False
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
|
||||
def _validate_date_args(self, date):
|
||||
date_formats = ["%Y-%m-%d", "%Y-%m-%d+%H:%M:%S"]
|
||||
|
||||
@ -3,7 +3,7 @@ import os
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Type
|
||||
from typing import Optional, Type
|
||||
|
||||
from unstructured.ingest.interfaces import (
|
||||
BaseConnector,
|
||||
@ -29,6 +29,8 @@ class SimpleFsspecConfig(BaseConnectorConfig):
|
||||
output_dir: str
|
||||
preserve_downloads: bool = False
|
||||
re_download: bool = False
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
|
||||
# fsspec specific options
|
||||
access_kwargs: dict = field(default_factory=dict)
|
||||
|
||||
@ -26,6 +26,8 @@ class SimpleGitConfig(BaseConnectorConfig):
|
||||
output_dir: str
|
||||
preserve_downloads: bool = False
|
||||
re_download: bool = False
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
|
||||
repo_path: str = field(init=False, repr=False)
|
||||
|
||||
|
||||
@ -4,7 +4,7 @@ import os
|
||||
from dataclasses import dataclass
|
||||
from mimetypes import guess_extension
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
from typing import Dict, Optional
|
||||
|
||||
from unstructured.file_utils.filetype import EXT_TO_FILETYPE
|
||||
from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
|
||||
@ -77,6 +77,8 @@ class SimpleGoogleDriveConfig(BaseConnectorConfig):
|
||||
output_dir: str
|
||||
re_download: bool = False
|
||||
preserve_downloads: bool = False
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
|
||||
recursive: bool = False
|
||||
|
||||
|
||||
@ -2,7 +2,7 @@ import json
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
from unstructured.ingest.interfaces import (
|
||||
BaseConnector,
|
||||
@ -31,6 +31,8 @@ class SimpleRedditConfig(BaseConnectorConfig):
|
||||
output_dir: str
|
||||
preserve_downloads: bool = False
|
||||
re_download: bool = False
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
|
||||
def __post_init__(self):
|
||||
if self.num_posts <= 0:
|
||||
|
||||
@ -2,7 +2,7 @@ import json
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
from unstructured.ingest.interfaces import (
|
||||
BaseConnector,
|
||||
@ -26,6 +26,8 @@ class SimpleWikipediaConfig(BaseConnectorConfig):
|
||||
output_dir: str
|
||||
preserve_downloads: bool = False
|
||||
re_download: bool = False
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
through Unstructured."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional
|
||||
|
||||
from unstructured.ingest.logger import logger
|
||||
from unstructured.partition.auto import partition
|
||||
@ -47,6 +48,8 @@ class BaseConnectorConfig(ABC):
|
||||
# where to write structured data outputs
|
||||
output_dir: str
|
||||
re_download: bool = False
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
|
||||
|
||||
class BaseIngestDoc(ABC):
|
||||
@ -58,6 +61,8 @@ class BaseIngestDoc(ABC):
|
||||
Crucially, it is not responsible for the actual processing of the raw document.
|
||||
"""
|
||||
|
||||
config: BaseConnectorConfig
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def filename(self):
|
||||
@ -94,7 +99,24 @@ class BaseIngestDoc(ABC):
|
||||
self.isd_elems_no_filename = []
|
||||
for elem in isd_elems:
|
||||
# type: ignore
|
||||
elem["metadata"].pop("filename", None) # type: ignore[attr-defined]
|
||||
if (
|
||||
self.config.metadata_exclude is not None
|
||||
and self.config.metadata_include is not None
|
||||
):
|
||||
raise ValueError(
|
||||
"Arguments `--metadata-include` and `--metadata-exclude` are "
|
||||
"mutually exclusive with each other.",
|
||||
)
|
||||
elif self.config.metadata_exclude is not None:
|
||||
ex_list = self.config.metadata_exclude.split(",")
|
||||
for ex in ex_list:
|
||||
elem["metadata"].pop(ex, None) # type: ignore[attr-defined]
|
||||
elif self.config.metadata_include is not None:
|
||||
in_list = self.config.metadata_include.split(",")
|
||||
for k in elem["metadata"]:
|
||||
if k not in in_list:
|
||||
elem["metadata"].pop(k, None) # type: ignore[attr-defined]
|
||||
|
||||
elem.pop("coordinates") # type: ignore[attr-defined]
|
||||
self.isd_elems_no_filename.append(elem)
|
||||
|
||||
|
||||
@ -103,6 +103,20 @@ class MainProcess:
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option(
|
||||
"--metadata-include",
|
||||
default=None,
|
||||
help="If set, include the specified metadata fields if they exist and drop all other fields. "
|
||||
"Usage: provide a single string with comma separated values. "
|
||||
"Example: --metadata-include filename,page_number ",
|
||||
)
|
||||
@click.option(
|
||||
"--metadata-exclude",
|
||||
default=None,
|
||||
help="If set, drop the specified metadata fields if they exist. "
|
||||
"Usage: provide a single string with comma separated values. "
|
||||
"Example: --metadata-exclude filename,page_number ",
|
||||
)
|
||||
@click.option(
|
||||
"--remote-url",
|
||||
default=None,
|
||||
@ -322,7 +336,15 @@ def main(
|
||||
reprocess,
|
||||
num_processes,
|
||||
verbose,
|
||||
metadata_include,
|
||||
metadata_exclude,
|
||||
):
|
||||
if metadata_exclude is not None and metadata_include is not None:
|
||||
logger.error(
|
||||
"Arguments `--metadata-include` and `--metadata-exclude` are "
|
||||
"mutually exclusive with each other.",
|
||||
)
|
||||
sys.exit(1)
|
||||
if not preserve_downloads and download_dir:
|
||||
logger.warning(
|
||||
"Not preserving downloaded files but --download_dir is specified",
|
||||
@ -391,6 +413,8 @@ def main(
|
||||
output_dir=structured_output_dir,
|
||||
re_download=re_download,
|
||||
preserve_downloads=preserve_downloads,
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
),
|
||||
)
|
||||
elif protocol in ("abfs", "az"):
|
||||
@ -411,6 +435,8 @@ def main(
|
||||
output_dir=structured_output_dir,
|
||||
re_download=re_download,
|
||||
preserve_downloads=preserve_downloads,
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
),
|
||||
)
|
||||
else:
|
||||
@ -427,6 +453,8 @@ def main(
|
||||
output_dir=structured_output_dir,
|
||||
re_download=re_download,
|
||||
preserve_downloads=preserve_downloads,
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
),
|
||||
)
|
||||
elif github_url:
|
||||
@ -441,6 +469,8 @@ def main(
|
||||
preserve_downloads=preserve_downloads,
|
||||
output_dir=structured_output_dir,
|
||||
re_download=re_download,
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
),
|
||||
)
|
||||
elif gitlab_url:
|
||||
@ -455,6 +485,8 @@ def main(
|
||||
preserve_downloads=preserve_downloads,
|
||||
output_dir=structured_output_dir,
|
||||
re_download=re_download,
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
),
|
||||
)
|
||||
elif subreddit_name:
|
||||
@ -471,6 +503,8 @@ def main(
|
||||
preserve_downloads=preserve_downloads,
|
||||
output_dir=structured_output_dir,
|
||||
re_download=re_download,
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
),
|
||||
)
|
||||
elif wikipedia_page_title:
|
||||
@ -483,6 +517,8 @@ def main(
|
||||
preserve_downloads=preserve_downloads,
|
||||
output_dir=structured_output_dir,
|
||||
re_download=re_download,
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
),
|
||||
)
|
||||
elif drive_id:
|
||||
@ -497,6 +533,8 @@ def main(
|
||||
preserve_downloads=preserve_downloads,
|
||||
output_dir=structured_output_dir,
|
||||
re_download=re_download,
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
),
|
||||
)
|
||||
elif biomed_path or biomed_api_id or biomed_api_from or biomed_api_until:
|
||||
@ -511,6 +549,8 @@ def main(
|
||||
preserve_downloads=preserve_downloads,
|
||||
output_dir=structured_output_dir,
|
||||
re_download=re_download,
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
),
|
||||
)
|
||||
# Check for other connector-specific options here and define the doc_connector object
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user