mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
feat: add --flatten-metadata to unstructured-ingest (#389)
* added --flatten-metadata to unstructured-ingest
* added unit tests for process_file()
This commit is contained in:
parent
66a0369fb6
commit
a4394f6f16
@ -1,4 +1,4 @@
|
||||
## 0.5.7-dev1
|
||||
## 0.5.7-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
|
||||
### Features
|
||||
|
||||
* Add `--flatten-metadata` parameter to `unstructured-ingest`
|
||||
* Add `--fields-include` parameter to `unstructured-ingest`
|
||||
|
||||
### Fixes
|
||||
|
@ -26,8 +26,7 @@ def test_process_file_metadata_include_filename(filename: str):
|
||||
isd_elems = ingest_doc.process_file()
|
||||
|
||||
for elem in isd_elems:
|
||||
for k in elem["metadata"]:
|
||||
assert k == "filename"
|
||||
assert set(elem["metadata"].keys()) == {"filename"}
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", test_files)
|
||||
@ -42,8 +41,7 @@ def test_process_file_metadata_include_filename_pagenum(filename: str):
|
||||
isd_elems = ingest_doc.process_file()
|
||||
|
||||
for elem in isd_elems:
|
||||
for k in elem["metadata"]:
|
||||
assert k in ["filename", "page_number"]
|
||||
assert set(elem["metadata"].keys()) == {"filename", "page_number"}
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", test_files)
|
||||
@ -58,8 +56,7 @@ def test_process_file_metadata_exclude_filename(filename: str):
|
||||
isd_elems = ingest_doc.process_file()
|
||||
|
||||
for elem in isd_elems:
|
||||
for k in elem["metadata"]:
|
||||
assert k != "filename"
|
||||
assert "filename" not in elem["metadata"].keys()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", test_files)
|
||||
@ -74,8 +71,8 @@ def test_process_file_metadata_exclude_filename_pagenum(filename: str):
|
||||
isd_elems = ingest_doc.process_file()
|
||||
|
||||
for elem in isd_elems:
|
||||
for k in elem["metadata"]:
|
||||
assert k not in ["filename", "page_number"]
|
||||
assert "filename" not in elem["metadata"].keys()
|
||||
assert "page_number" not in elem["metadata"].keys()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", test_files)
|
||||
@ -87,7 +84,9 @@ def test_process_file_fields_include_default(filename: str):
|
||||
),
|
||||
)
|
||||
isd_elems = ingest_doc.process_file()
|
||||
assert set("element_id", "text", "type", "metadata") == set(elem.keys())
|
||||
|
||||
for elem in isd_elems:
|
||||
assert {"element_id", "text", "type", "metadata"} == set(elem.keys())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", test_files)
|
||||
@ -100,5 +99,38 @@ def test_process_file_fields_include_elementid(filename: str):
|
||||
),
|
||||
)
|
||||
isd_elems = ingest_doc.process_file()
|
||||
assert set("element_id") == set(elem.keys())
|
||||
|
||||
for elem in isd_elems:
|
||||
assert {"element_id"} == set(elem.keys())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", test_files)
|
||||
def test_process_file_flatten_metadata_filename(filename: str):
|
||||
ingest_doc = GitIngestDoc(
|
||||
path=filename,
|
||||
config=SimpleGitConfig(
|
||||
download_dir=EXAMPLE_DOCS_DIRECTORY,
|
||||
metadata_include="filename",
|
||||
flatten_metadata=True,
|
||||
),
|
||||
)
|
||||
isd_elems = ingest_doc.process_file()
|
||||
|
||||
for elem in isd_elems:
|
||||
assert {"element_id", "text", "type", "filename"} == set(elem.keys())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", test_files)
|
||||
def test_process_file_flatten_metadata_filename_pagenum(filename: str):
|
||||
ingest_doc = GitIngestDoc(
|
||||
path=filename,
|
||||
config=SimpleGitConfig(
|
||||
download_dir=EXAMPLE_DOCS_DIRECTORY,
|
||||
metadata_include="filename,page_number",
|
||||
flatten_metadata=True,
|
||||
),
|
||||
)
|
||||
isd_elems = ingest_doc.process_file()
|
||||
|
||||
for elem in isd_elems:
|
||||
assert {"element_id", "text", "type", "filename", "page_number"} == set(elem.keys())
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.5.7-dev1" # pragma: no cover
|
||||
__version__ = "0.5.7-dev2" # pragma: no cover
|
||||
|
@ -51,6 +51,7 @@ class SimpleBiomedConfig(BaseConnectorConfig):
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
fields_include: str = "element_id,text,type,metadata"
|
||||
flatten_metadata: bool = False
|
||||
|
||||
def _validate_date_args(self, date):
|
||||
date_formats = ["%Y-%m-%d", "%Y-%m-%d+%H:%M:%S"]
|
||||
|
@ -32,6 +32,7 @@ class SimpleFsspecConfig(BaseConnectorConfig):
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
fields_include: str = "element_id,text,type,metadata"
|
||||
flatten_metadata: bool = False
|
||||
|
||||
# fsspec specific options
|
||||
access_kwargs: dict = field(default_factory=dict)
|
||||
|
@ -29,6 +29,7 @@ class SimpleGitConfig(BaseConnectorConfig):
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
fields_include: str = "element_id,text,type,metadata"
|
||||
flatten_metadata: bool = False
|
||||
|
||||
repo_path: str = field(init=False, repr=False)
|
||||
|
||||
|
@ -80,6 +80,7 @@ class SimpleGoogleDriveConfig(BaseConnectorConfig):
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
fields_include: str = "element_id,text,type,metadata"
|
||||
flatten_metadata: bool = False
|
||||
|
||||
recursive: bool = False
|
||||
|
||||
|
@ -34,6 +34,7 @@ class SimpleRedditConfig(BaseConnectorConfig):
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
fields_include: str = "element_id,text,type,metadata"
|
||||
flatten_metadata: bool = False
|
||||
|
||||
def __post_init__(self):
|
||||
if self.num_posts <= 0:
|
||||
|
@ -29,6 +29,7 @@ class SimpleWikipediaConfig(BaseConnectorConfig):
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
fields_include: str = "element_id,text,type,metadata"
|
||||
flatten_metadata: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
|
@ -51,6 +51,7 @@ class BaseConnectorConfig(ABC):
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
fields_include: str = "element_id,text,type,metadata"
|
||||
flatten_metadata: bool = False
|
||||
|
||||
|
||||
class BaseIngestDoc(ABC):
|
||||
@ -121,6 +122,11 @@ class BaseIngestDoc(ABC):
|
||||
in_list = self.config.fields_include.split(",")
|
||||
elem = {k: v for k, v in elem.items() if k in in_list}
|
||||
|
||||
if self.config.flatten_metadata:
|
||||
for k, v in elem["metadata"].items(): # type: ignore[attr-defined]
|
||||
elem[k] = v
|
||||
elem.pop("metadata") # type: ignore[attr-defined]
|
||||
|
||||
self.isd_elems_no_filename.append(elem)
|
||||
|
||||
return self.isd_elems_no_filename
|
||||
|
@ -103,6 +103,14 @@ class MainProcess:
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option(
|
||||
"--flatten-metadata",
|
||||
is_flag=True,
|
||||
default=False,
|
||||
help="Results in flattened json elements. "
|
||||
"Specifically, the metadata key values are brought to the top-level of the element, "
|
||||
"and the `metadata` key itself is removed.",
|
||||
)
|
||||
@click.option(
|
||||
"--fields-include",
|
||||
default="element_id,text,type,metadata",
|
||||
@ -345,7 +353,13 @@ def main(
|
||||
metadata_include,
|
||||
metadata_exclude,
|
||||
fields_include,
|
||||
flatten_metadata,
|
||||
):
|
||||
if flatten_metadata and "metadata" not in fields_include:
|
||||
logger.warning(
|
||||
"`--flatten-metadata` is specified, but there is no metadata to flatten, "
|
||||
"since `metadata` is not specified in `--fields-include`.",
|
||||
)
|
||||
if "metadata" not in fields_include and (metadata_include or metadata_exclude):
|
||||
logger.warning(
|
||||
"Either `--metadata-include` or `--metadata-exclude` is specified"
|
||||
@ -428,6 +442,7 @@ def main(
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
fields_include=fields_include,
|
||||
flatten_metadata=flatten_metadata,
|
||||
),
|
||||
)
|
||||
elif protocol in ("abfs", "az"):
|
||||
@ -451,6 +466,7 @@ def main(
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
fields_include=fields_include,
|
||||
flatten_metadata=flatten_metadata,
|
||||
),
|
||||
)
|
||||
else:
|
||||
@ -470,6 +486,7 @@ def main(
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
fields_include=fields_include,
|
||||
flatten_metadata=flatten_metadata,
|
||||
),
|
||||
)
|
||||
elif github_url:
|
||||
@ -487,6 +504,7 @@ def main(
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
fields_include=fields_include,
|
||||
flatten_metadata=flatten_metadata,
|
||||
),
|
||||
)
|
||||
elif gitlab_url:
|
||||
@ -504,6 +522,7 @@ def main(
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
fields_include=fields_include,
|
||||
flatten_metadata=flatten_metadata,
|
||||
),
|
||||
)
|
||||
elif subreddit_name:
|
||||
@ -523,6 +542,7 @@ def main(
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
fields_include=fields_include,
|
||||
flatten_metadata=flatten_metadata,
|
||||
),
|
||||
)
|
||||
elif wikipedia_page_title:
|
||||
@ -538,6 +558,7 @@ def main(
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
fields_include=fields_include,
|
||||
flatten_metadata=flatten_metadata,
|
||||
),
|
||||
)
|
||||
elif drive_id:
|
||||
@ -555,6 +576,7 @@ def main(
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
fields_include=fields_include,
|
||||
flatten_metadata=flatten_metadata,
|
||||
),
|
||||
)
|
||||
elif biomed_path or biomed_api_id or biomed_api_from or biomed_api_until:
|
||||
@ -572,6 +594,7 @@ def main(
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
fields_include=fields_include,
|
||||
flatten_metadata=flatten_metadata,
|
||||
),
|
||||
)
|
||||
# Check for other connector-specific options here and define the doc_connector object
|
||||
|
Loading…
x
Reference in New Issue
Block a user