From 9fea85dc210a52f0aaafecad97a43fd708a98304 Mon Sep 17 00:00:00 2001 From: David Potter Date: Tue, 23 Jan 2024 13:52:11 -0800 Subject: [PATCH] fix: remove none value keys from flattened dictionary (#2442) When a partitioned or embedded document json has null values, those get converted to a dictionary with None values. This happens in the metadata. I have not seen it in other keys. Chroma and Pinecone do not like those None values. `flatten_dict` has been modified with a `remove_none` arg to remove keys with None values. Also, Pinecone has been pinned at 2.2.4 because at 3.0 and above it breaks our code. --------- Co-authored-by: potter-potter --- CHANGELOG.md | 3 +- requirements/ingest/pinecone.in | 2 +- .../staging/test_base_staging.py | 28 +++++++++++++++++++ unstructured/__version__.py | 2 +- unstructured/ingest/connector/chroma.py | 4 ++- unstructured/ingest/connector/pinecone.py | 1 + unstructured/staging/base.py | 17 +++++++++-- 7 files changed, 50 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 941c847c2..a1bc82865 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.12.3-dev3 +## 0.12.3-dev4 ### Enhancements @@ -12,6 +12,7 @@ ### Fixes * **Fix FSSpec destination connectors check_connection.** FSSpec destination connectors did not use `check_connection`. There was an error when trying to `ls` destination directory - it may not exist at the moment of connector creation. Now `check_connection` calls `ls` on bucket root and this method is called on `initialize` of destination connector. * **Fix databricks-volumes extra location.** `setup.py` is currently pointing to the wrong location for the databricks-volumes extra requirements. This results in errors when trying to build the wheel for unstructured. This change updates to point to the correct path. +* **Fix uploading None values to Chroma and Pinecone.** Removes keys with None values with Pinecone and Chroma destinations. 
Pins Pinecone dependency ## 0.12.2 diff --git a/requirements/ingest/pinecone.in b/requirements/ingest/pinecone.in index ebaedb531..d1cc814f8 100644 --- a/requirements/ingest/pinecone.in +++ b/requirements/ingest/pinecone.in @@ -1,3 +1,3 @@ -c ../constraints.in -c ../base.txt -pinecone-client +pinecone-client==2.2.4 diff --git a/test_unstructured/staging/test_base_staging.py b/test_unstructured/staging/test_base_staging.py index f113f1f3d..52f1410c1 100644 --- a/test_unstructured/staging/test_base_staging.py +++ b/test_unstructured/staging/test_base_staging.py @@ -464,6 +464,34 @@ def test_flatten_dict_flatten_list_omit_keys(): ) +def test_flatten_dict_flatten_list_omit_keys_remove_none(): + """Flattening a dictionary with flatten_lists set to True and also omitting keys + and setting remove_none to True""" + dictionary = {"a": None, "b": [2, 3, 4], "c": {"d": None, "e": [6, 7]}} + keys_to_omit = ["c"] + expected_result = {"b_0": 2, "b_1": 3, "b_2": 4, "c": {"d": None, "e": [6, 7]}} + assert ( + base.flatten_dict( + dictionary, keys_to_omit=keys_to_omit, flatten_lists=True, remove_none=True + ) + == expected_result + ) + + +def test_flatten_dict_flatten_list_remove_none(): + """Flattening a dictionary with flatten_lists set to True and setting remove_none to True""" + dictionary = {"a": None, "b": [2, 3, 4], "c": {"d": None, "e": [6, 7]}} + expected_result = {"b_0": 2, "b_1": 3, "b_2": 4, "c_e_0": 6, "c_e_1": 7} + assert base.flatten_dict(dictionary, flatten_lists=True, remove_none=True) == expected_result + + +def test_flatten_dict_flatten_list_none_in_list_remove_none(): + """Flattening a dictionary with flatten_lists and remove_none set to True and None in list""" + dictionary = {"a": 1, "b": [2, 3, 4], "c": {"d": None, "e": [6, None]}} + expected_result = {"a": 1, "b_0": 2, "b_1": 3, "b_2": 4, "c_e_0": 6} + assert base.flatten_dict(dictionary, flatten_lists=True, remove_none=True) == expected_result + + def test_flatten_dict_flatten_list_omit_keys2(): 
"""Flattening a dictionary with flatten_lists set to True and also omitting keys""" dictionary = {"a": 1, "b": [2, 3, 4], "c": {"d": 5, "e": [6, 7]}} diff --git a/unstructured/__version__.py b/unstructured/__version__.py index e62434f7c..5b743183b 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.12.3-dev3" # pragma: no cover +__version__ = "0.12.3-dev4" # pragma: no cover diff --git a/unstructured/ingest/connector/chroma.py b/unstructured/ingest/connector/chroma.py index ae9912ec5..688f4f2da 100644 --- a/unstructured/ingest/connector/chroma.py +++ b/unstructured/ingest/connector/chroma.py @@ -151,5 +151,7 @@ class ChromaDestinationConnector(BaseDestinationConnector): "id": str(uuid.uuid4()), "embedding": element_dict.pop("embeddings", None), "document": element_dict.pop("text", None), - "metadata": flatten_dict(element_dict, separator="-", flatten_lists=True), + "metadata": flatten_dict( + element_dict, separator="-", flatten_lists=True, remove_none=True + ), } diff --git a/unstructured/ingest/connector/pinecone.py b/unstructured/ingest/connector/pinecone.py index e117043e1..dd6f5d023 100644 --- a/unstructured/ingest/connector/pinecone.py +++ b/unstructured/ingest/connector/pinecone.py @@ -135,6 +135,7 @@ class PineconeDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationC element_dict, separator="-", flatten_lists=True, + remove_none=True, ), }, } diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py index 08f29e9d1..b7e17a065 100644 --- a/unstructured/staging/base.py +++ b/unstructured/staging/base.py @@ -177,20 +177,30 @@ def elements_from_json( def flatten_dict( - dictionary, parent_key="", separator="_", flatten_lists=False, keys_to_omit: List[str] = None + dictionary, + parent_key="", + separator="_", + flatten_lists=False, + remove_none=False, + keys_to_omit: List[str] = None, ): """Flattens a nested dictionary into a single level dictionary. 
keys_to_omit is a list of keys that don't get flattened. If omitting a nested key, format as {parent_key}{separator}{key}. - If flatten_lists is True, then lists and tuples are flattened as well.""" + If flatten_lists is True, then lists and tuples are flattened as well. + If remove_none is True, then None keys/values are removed from the flattened dictionary.""" keys_to_omit = keys_to_omit if keys_to_omit else [] flattened_dict = {} for key, value in dictionary.items(): new_key = f"{parent_key}{separator}{key}" if parent_key else key if new_key in keys_to_omit: flattened_dict[new_key] = value + elif value is None and remove_none: + continue elif isinstance(value, dict): flattened_dict.update( - flatten_dict(value, new_key, separator, flatten_lists, keys_to_omit=keys_to_omit), + flatten_dict( + value, new_key, separator, flatten_lists, remove_none, keys_to_omit=keys_to_omit + ), ) elif isinstance(value, (list, tuple)) and flatten_lists: for index, item in enumerate(value): @@ -200,6 +210,7 @@ def flatten_dict( "", separator, flatten_lists, + remove_none, keys_to_omit=keys_to_omit, ) )