feat: add --flatten-metadata to unstructured-ingest (#389)

* added --flatten-metadata to unstructured-ingest

* added unit tests for process_file()
This commit is contained in:
natygyoon 2023-03-23 05:52:56 +09:00 committed by GitHub
parent 66a0369fb6
commit a4394f6f16
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 80 additions and 12 deletions

View File

@ -1,4 +1,4 @@
## 0.5.7-dev1
## 0.5.7-dev2
### Enhancements
@ -6,6 +6,7 @@
### Features
* Add `--flatten-metadata` parameter to `unstructured-ingest`
* Add `--fields-include` parameter to `unstructured-ingest`
### Fixes

View File

@ -26,8 +26,7 @@ def test_process_file_metadata_include_filename(filename: str):
isd_elems = ingest_doc.process_file()
for elem in isd_elems:
for k in elem["metadata"]:
assert k == "filename"
assert set(elem["metadata"].keys()) == {"filename"}
@pytest.mark.parametrize("filename", test_files)
@ -42,8 +41,7 @@ def test_process_file_metadata_include_filename_pagenum(filename: str):
isd_elems = ingest_doc.process_file()
for elem in isd_elems:
for k in elem["metadata"]:
assert k in ["filename", "page_number"]
assert set(elem["metadata"].keys()) == {"filename", "page_number"}
@pytest.mark.parametrize("filename", test_files)
@ -58,8 +56,7 @@ def test_process_file_metadata_exclude_filename(filename: str):
isd_elems = ingest_doc.process_file()
for elem in isd_elems:
for k in elem["metadata"]:
assert k != "filename"
assert "filename" not in elem["metadata"].keys()
@pytest.mark.parametrize("filename", test_files)
@ -74,8 +71,8 @@ def test_process_file_metadata_exclude_filename_pagenum(filename: str):
isd_elems = ingest_doc.process_file()
for elem in isd_elems:
for k in elem["metadata"]:
assert k not in ["filename", "page_number"]
assert "filename" not in elem["metadata"].keys()
assert "page_number" not in elem["metadata"].keys()
@pytest.mark.parametrize("filename", test_files)
@ -87,7 +84,9 @@ def test_process_file_fields_include_default(filename: str):
),
)
isd_elems = ingest_doc.process_file()
assert set("element_id", "text", "type", "metadata") == set(elem.keys())
for elem in isd_elems:
assert {"element_id", "text", "type", "metadata"} == set(elem.keys())
@pytest.mark.parametrize("filename", test_files)
@ -100,5 +99,38 @@ def test_process_file_fields_include_elementid(filename: str):
),
)
isd_elems = ingest_doc.process_file()
assert set("element_id") == set(elem.keys())
for elem in isd_elems:
assert {"element_id"} == set(elem.keys())
@pytest.mark.parametrize("filename", test_files)
def test_process_file_flatten_metadata_filename(filename: str):
    """Check the element keys when flattening with only `filename` metadata kept.

    The config sets `metadata_include="filename"` and `flatten_metadata=True`;
    every element returned by `process_file()` must then expose exactly the
    keys `element_id`, `text`, `type` and `filename` (no `metadata` key).
    """
    config = SimpleGitConfig(
        download_dir=EXAMPLE_DOCS_DIRECTORY,
        metadata_include="filename",
        flatten_metadata=True,
    )
    doc = GitIngestDoc(path=filename, config=config)

    expected_keys = {"element_id", "text", "type", "filename"}
    for element in doc.process_file():
        assert expected_keys == set(element.keys())
@pytest.mark.parametrize("filename", test_files)
def test_process_file_flatten_metadata_filename_pagenum(filename: str):
    """Check the element keys when flattening with `filename` and `page_number` kept.

    The config sets `metadata_include="filename,page_number"` and
    `flatten_metadata=True`; every element returned by `process_file()` must
    then expose exactly the keys `element_id`, `text`, `type`, `filename` and
    `page_number` (no `metadata` key).
    """
    config = SimpleGitConfig(
        download_dir=EXAMPLE_DOCS_DIRECTORY,
        metadata_include="filename,page_number",
        flatten_metadata=True,
    )
    doc = GitIngestDoc(path=filename, config=config)

    expected_keys = {"element_id", "text", "type", "filename", "page_number"}
    for element in doc.process_file():
        assert expected_keys == set(element.keys())

View File

@ -1 +1 @@
__version__ = "0.5.7-dev1" # pragma: no cover
__version__ = "0.5.7-dev2" # pragma: no cover

View File

@ -51,6 +51,7 @@ class SimpleBiomedConfig(BaseConnectorConfig):
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False
def _validate_date_args(self, date):
date_formats = ["%Y-%m-%d", "%Y-%m-%d+%H:%M:%S"]

View File

@ -32,6 +32,7 @@ class SimpleFsspecConfig(BaseConnectorConfig):
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False
# fsspec specific options
access_kwargs: dict = field(default_factory=dict)

View File

@ -29,6 +29,7 @@ class SimpleGitConfig(BaseConnectorConfig):
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False
repo_path: str = field(init=False, repr=False)

View File

@ -80,6 +80,7 @@ class SimpleGoogleDriveConfig(BaseConnectorConfig):
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False
recursive: bool = False

View File

@ -34,6 +34,7 @@ class SimpleRedditConfig(BaseConnectorConfig):
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False
def __post_init__(self):
if self.num_posts <= 0:

View File

@ -29,6 +29,7 @@ class SimpleWikipediaConfig(BaseConnectorConfig):
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False
@dataclass

View File

@ -51,6 +51,7 @@ class BaseConnectorConfig(ABC):
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False
class BaseIngestDoc(ABC):
@ -121,6 +122,11 @@ class BaseIngestDoc(ABC):
in_list = self.config.fields_include.split(",")
elem = {k: v for k, v in elem.items() if k in in_list}
if self.config.flatten_metadata:
for k, v in elem["metadata"].items(): # type: ignore[attr-defined]
elem[k] = v
elem.pop("metadata") # type: ignore[attr-defined]
self.isd_elems_no_filename.append(elem)
return self.isd_elems_no_filename

View File

@ -103,6 +103,14 @@ class MainProcess:
@click.command()
@click.option(
"--flatten-metadata",
is_flag=True,
default=False,
help="Results in flattened json elements. "
"Specifically, the metadata key values are brought to the top-level of the element, "
"and the `metadata` key itself is removed.",
)
@click.option(
"--fields-include",
default="element_id,text,type,metadata",
@ -345,7 +353,13 @@ def main(
metadata_include,
metadata_exclude,
fields_include,
flatten_metadata,
):
if flatten_metadata and "metadata" not in fields_include:
logger.warning(
"`--flatten-metadata` is specified, but there is no metadata to flatten, "
"since `metadata` is not specified in `--fields-include`.",
)
if "metadata" not in fields_include and (metadata_include or metadata_exclude):
logger.warning(
"Either `--metadata-include` or `--metadata-exclude` is specified"
@ -428,6 +442,7 @@ def main(
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
),
)
elif protocol in ("abfs", "az"):
@ -451,6 +466,7 @@ def main(
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
),
)
else:
@ -470,6 +486,7 @@ def main(
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
),
)
elif github_url:
@ -487,6 +504,7 @@ def main(
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
),
)
elif gitlab_url:
@ -504,6 +522,7 @@ def main(
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
),
)
elif subreddit_name:
@ -523,6 +542,7 @@ def main(
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
),
)
elif wikipedia_page_title:
@ -538,6 +558,7 @@ def main(
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
),
)
elif drive_id:
@ -555,6 +576,7 @@ def main(
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
),
)
elif biomed_path or biomed_api_id or biomed_api_from or biomed_api_until:
@ -572,6 +594,7 @@ def main(
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
),
)
# Check for other connector-specific options here and define the doc_connector object