mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-02 19:13:13 +00:00
fix: remove none value keys from flattened dictionary (#2442)
When a partitioned or embedded document JSON has null values, those get converted to a dictionary with None values. This happens in the metadata; I have not seen it in other keys. Chroma and Pinecone do not like those None values. `flatten_dict` has been modified with a `remove_none` arg to remove keys with None values. Also, Pinecone has been pinned at 2.2.4 because versions 3.0 and above break our code. --------- Co-authored-by: potter-potter <david.potter@gmail.com>
This commit is contained in:
parent
a155e7a43b
commit
9fea85dc21
@ -1,4 +1,4 @@
|
||||
## 0.12.3-dev3
|
||||
## 0.12.3-dev4
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -12,6 +12,7 @@
|
||||
### Fixes
|
||||
* **Fix FSSpec destination connectors check_connection.** FSSpec destination connectors did not use `check_connection`. There was an error when trying to `ls` destination directory - it may not exist at the moment of connector creation. Now `check_connection` calls `ls` on bucket root and this method is called on `initialize` of destination connector.
|
||||
* **Fix databricks-volumes extra location.** `setup.py` is currently pointing to the wrong location for the databricks-volumes extra requirements. This results in errors when trying to build the wheel for unstructured. This change updates to point to the correct path.
|
||||
* **Fix uploading None values to Chroma and Pinecone.** Removes keys with None values before uploading to the Pinecone and Chroma destinations. Pins the Pinecone dependency to 2.2.4.
|
||||
|
||||
## 0.12.2
|
||||
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
-c ../constraints.in
|
||||
-c ../base.txt
|
||||
pinecone-client
|
||||
pinecone-client==2.2.4
|
||||
|
||||
@ -464,6 +464,34 @@ def test_flatten_dict_flatten_list_omit_keys():
|
||||
)
|
||||
|
||||
|
||||
def test_flatten_dict_flatten_list_omit_keys_remove_none():
|
||||
"""Flattening a dictionary with flatten_lists set to True and also omitting keys
|
||||
and setting remove_none to True"""
|
||||
dictionary = {"a": None, "b": [2, 3, 4], "c": {"d": None, "e": [6, 7]}}
|
||||
keys_to_omit = ["c"]
|
||||
expected_result = {"b_0": 2, "b_1": 3, "b_2": 4, "c": {"d": None, "e": [6, 7]}}
|
||||
assert (
|
||||
base.flatten_dict(
|
||||
dictionary, keys_to_omit=keys_to_omit, flatten_lists=True, remove_none=True
|
||||
)
|
||||
== expected_result
|
||||
)
|
||||
|
||||
|
||||
def test_flatten_dict_flatten_list_remove_none():
|
||||
"""Flattening a dictionary with flatten_lists set to True and setting remove_none to True"""
|
||||
dictionary = {"a": None, "b": [2, 3, 4], "c": {"d": None, "e": [6, 7]}}
|
||||
expected_result = {"b_0": 2, "b_1": 3, "b_2": 4, "c_e_0": 6, "c_e_1": 7}
|
||||
assert base.flatten_dict(dictionary, flatten_lists=True, remove_none=True) == expected_result
|
||||
|
||||
|
||||
def test_flatten_dict_flatten_list_none_in_list_remove_none():
|
||||
"""Flattening a dictionary with flatten_lists and remove_none set to True and None in list"""
|
||||
dictionary = {"a": 1, "b": [2, 3, 4], "c": {"d": None, "e": [6, None]}}
|
||||
expected_result = {"a": 1, "b_0": 2, "b_1": 3, "b_2": 4, "c_e_0": 6}
|
||||
assert base.flatten_dict(dictionary, flatten_lists=True, remove_none=True) == expected_result
|
||||
|
||||
|
||||
def test_flatten_dict_flatten_list_omit_keys2():
|
||||
"""Flattening a dictionary with flatten_lists set to True and also omitting keys"""
|
||||
dictionary = {"a": 1, "b": [2, 3, 4], "c": {"d": 5, "e": [6, 7]}}
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.12.3-dev3" # pragma: no cover
|
||||
__version__ = "0.12.3-dev4" # pragma: no cover
|
||||
|
||||
@ -151,5 +151,7 @@ class ChromaDestinationConnector(BaseDestinationConnector):
|
||||
"id": str(uuid.uuid4()),
|
||||
"embedding": element_dict.pop("embeddings", None),
|
||||
"document": element_dict.pop("text", None),
|
||||
"metadata": flatten_dict(element_dict, separator="-", flatten_lists=True),
|
||||
"metadata": flatten_dict(
|
||||
element_dict, separator="-", flatten_lists=True, remove_none=True
|
||||
),
|
||||
}
|
||||
|
||||
@ -135,6 +135,7 @@ class PineconeDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationC
|
||||
element_dict,
|
||||
separator="-",
|
||||
flatten_lists=True,
|
||||
remove_none=True,
|
||||
),
|
||||
},
|
||||
}
|
||||
|
||||
@ -177,20 +177,30 @@ def elements_from_json(
|
||||
|
||||
|
||||
def flatten_dict(
|
||||
dictionary, parent_key="", separator="_", flatten_lists=False, keys_to_omit: List[str] = None
|
||||
dictionary,
|
||||
parent_key="",
|
||||
separator="_",
|
||||
flatten_lists=False,
|
||||
remove_none=False,
|
||||
keys_to_omit: List[str] = None,
|
||||
):
|
||||
"""Flattens a nested dictionary into a single level dictionary. keys_to_omit is a list of keys
|
||||
that don't get flattened. If omitting a nested key, format as {parent_key}{separator}{key}.
|
||||
If flatten_lists is True, then lists and tuples are flattened as well."""
|
||||
If flatten_lists is True, then lists and tuples are flattened as well.
|
||||
If remove_none is True, then None keys/values are removed from the flattened dictionary."""
|
||||
keys_to_omit = keys_to_omit if keys_to_omit else []
|
||||
flattened_dict = {}
|
||||
for key, value in dictionary.items():
|
||||
new_key = f"{parent_key}{separator}{key}" if parent_key else key
|
||||
if new_key in keys_to_omit:
|
||||
flattened_dict[new_key] = value
|
||||
elif value is None and remove_none:
|
||||
continue
|
||||
elif isinstance(value, dict):
|
||||
flattened_dict.update(
|
||||
flatten_dict(value, new_key, separator, flatten_lists, keys_to_omit=keys_to_omit),
|
||||
flatten_dict(
|
||||
value, new_key, separator, flatten_lists, remove_none, keys_to_omit=keys_to_omit
|
||||
),
|
||||
)
|
||||
elif isinstance(value, (list, tuple)) and flatten_lists:
|
||||
for index, item in enumerate(value):
|
||||
@ -200,6 +210,7 @@ def flatten_dict(
|
||||
"",
|
||||
separator,
|
||||
flatten_lists,
|
||||
remove_none,
|
||||
keys_to_omit=keys_to_omit,
|
||||
)
|
||||
)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user