fix: remove none value keys from flattened dictionary (#2442)

When the JSON of a partitioned or embedded document contains null values,
they are converted to None values in the resulting dictionary.

This happens in the metadata. I have not seen it in other keys.

Chroma and Pinecone do not like those None values. 

`flatten_dict` has been modified with a `remove_none` arg to remove keys
with None values.

Also, the Pinecone client has been pinned to 2.2.4 because versions 3.0
and above break our code.

---------

Co-authored-by: potter-potter <david.potter@gmail.com>
This commit is contained in:
David Potter 2024-01-23 13:52:11 -08:00 committed by GitHub
parent a155e7a43b
commit 9fea85dc21
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 50 additions and 7 deletions

View File

@ -1,4 +1,4 @@
## 0.12.3-dev3
## 0.12.3-dev4
### Enhancements
@ -12,6 +12,7 @@
### Fixes
* **Fix FSSpec destination connectors check_connection.** FSSpec destination connectors did not use `check_connection`. There was an error when trying to `ls` destination directory - it may not exist at the moment of connector creation. Now `check_connection` calls `ls` on bucket root and this method is called on `initialize` of destination connector.
* **Fix databricks-volumes extra location.** `setup.py` is currently pointing to the wrong location for the databricks-volumes extra requirements. This results in errors when trying to build the wheel for unstructured. This change updates to point to the correct path.
* **Fix uploading None values to Chroma and Pinecone.** Removes keys with None values before uploading to the Pinecone and Chroma destinations. Pins the Pinecone dependency to 2.2.4.
## 0.12.2

View File

@ -1,3 +1,3 @@
-c ../constraints.in
-c ../base.txt
pinecone-client
pinecone-client==2.2.4

View File

@ -464,6 +464,34 @@ def test_flatten_dict_flatten_list_omit_keys():
)
def test_flatten_dict_flatten_list_omit_keys_remove_none():
    """Check flatten_dict with flatten_lists=True, remove_none=True and omitted keys.

    The top-level None entry is expected to be dropped, while the omitted key
    "c" is expected to be passed through unchanged, including its nested None.
    """
    nested = {"a": None, "b": [2, 3, 4], "c": {"d": None, "e": [6, 7]}}
    omitted = ["c"]
    flattened = base.flatten_dict(
        nested, keys_to_omit=omitted, flatten_lists=True, remove_none=True
    )
    # "c" is in keys_to_omit, so it is neither flattened nor stripped of None values.
    assert flattened == {"b_0": 2, "b_1": 3, "b_2": 4, "c": {"d": None, "e": [6, 7]}}
def test_flatten_dict_flatten_list_remove_none():
    """Check flatten_dict with flatten_lists=True and remove_none=True.

    Keys whose value is None are expected to be absent from the result at both
    the top level ("a") and inside a nested dictionary ("c" -> "d").
    """
    nested = {"a": None, "b": [2, 3, 4], "c": {"d": None, "e": [6, 7]}}
    flattened = base.flatten_dict(nested, flatten_lists=True, remove_none=True)
    assert flattened == {"b_0": 2, "b_1": 3, "b_2": 4, "c_e_0": 6, "c_e_1": 7}
def test_flatten_dict_flatten_list_none_in_list_remove_none():
    """Check flatten_dict with flatten_lists=True and remove_none=True when a list holds None.

    The None element of the nested list ("c" -> "e", index 1) is expected to
    produce no key in the flattened result, as is the nested None value "d".
    """
    nested = {"a": 1, "b": [2, 3, 4], "c": {"d": None, "e": [6, None]}}
    flattened = base.flatten_dict(nested, flatten_lists=True, remove_none=True)
    assert flattened == {"a": 1, "b_0": 2, "b_1": 3, "b_2": 4, "c_e_0": 6}
def test_flatten_dict_flatten_list_omit_keys2():
"""Flattening a dictionary with flatten_lists set to True and also omitting keys"""
dictionary = {"a": 1, "b": [2, 3, 4], "c": {"d": 5, "e": [6, 7]}}

View File

@ -1 +1 @@
__version__ = "0.12.3-dev3" # pragma: no cover
__version__ = "0.12.3-dev4" # pragma: no cover

View File

@ -151,5 +151,7 @@ class ChromaDestinationConnector(BaseDestinationConnector):
"id": str(uuid.uuid4()),
"embedding": element_dict.pop("embeddings", None),
"document": element_dict.pop("text", None),
"metadata": flatten_dict(element_dict, separator="-", flatten_lists=True),
"metadata": flatten_dict(
element_dict, separator="-", flatten_lists=True, remove_none=True
),
}

View File

@ -135,6 +135,7 @@ class PineconeDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationC
element_dict,
separator="-",
flatten_lists=True,
remove_none=True,
),
},
}

View File

@ -177,20 +177,30 @@ def elements_from_json(
def flatten_dict(
dictionary, parent_key="", separator="_", flatten_lists=False, keys_to_omit: List[str] = None
dictionary,
parent_key="",
separator="_",
flatten_lists=False,
remove_none=False,
keys_to_omit: List[str] = None,
):
"""Flattens a nested dictionary into a single level dictionary. keys_to_omit is a list of keys
that don't get flattened. If omitting a nested key, format as {parent_key}{separator}{key}.
If flatten_lists is True, then lists and tuples are flattened as well."""
If flatten_lists is True, then lists and tuples are flattened as well.
If remove_none is True, then None keys/values are removed from the flattened dictionary."""
keys_to_omit = keys_to_omit if keys_to_omit else []
flattened_dict = {}
for key, value in dictionary.items():
new_key = f"{parent_key}{separator}{key}" if parent_key else key
if new_key in keys_to_omit:
flattened_dict[new_key] = value
elif value is None and remove_none:
continue
elif isinstance(value, dict):
flattened_dict.update(
flatten_dict(value, new_key, separator, flatten_lists, keys_to_omit=keys_to_omit),
flatten_dict(
value, new_key, separator, flatten_lists, remove_none, keys_to_omit=keys_to_omit
),
)
elif isinstance(value, (list, tuple)) and flatten_lists:
for index, item in enumerate(value):
@ -200,6 +210,7 @@ def flatten_dict(
"",
separator,
flatten_lists,
remove_none,
keys_to_omit=keys_to_omit,
)
)