From a9ad8ac8d13da38a6fc7c05bbe83c169cecc2be7 Mon Sep 17 00:00:00 2001 From: Ahmet Melek <39141206+ahmetmeleq@users.noreply.github.com> Date: Fri, 19 Jan 2024 00:21:22 +0000 Subject: [PATCH] fix: update flatten dict to support flattening tuples (#2423) This PR updates the flatten_dict function to support flattening tuples. This is necessary for objects like Coordinates: when the object is not written to disk, it is not converted to a list before being flattened, so it is still a tuple. --- CHANGELOG.md | 1 + test_unstructured/staging/test_base_staging.py | 14 ++++++++++++++ unstructured/staging/base.py | 4 ++-- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 27187ad00..2ae6e13f4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ * **Fix the serialization of the Elasticsearch destination connector.** Presence of the _client object breaks serialization due to TypeError: cannot pickle '_thread.lock' object. This removes that object before serialization. * **Fix the serialization of the Postgres destination connector.** Presence of the _client object breaks serialization due to TypeError: cannot pickle '_thread.lock' object. This removes that object before serialization. * **Fix documentation and sample code for Chroma.** Was pointing to wrong examples.. +* **Fix flatten_dict to be able to flatten tuples inside dicts.** Update the flatten_dict function to support flattening tuples inside dicts. This is necessary for objects like Coordinates: when the object is not written to disk, it is not converted to a list before being flattened, so it is still a tuple. 
## 0.12.0 diff --git a/test_unstructured/staging/test_base_staging.py b/test_unstructured/staging/test_base_staging.py index 741d9b676..f113f1f3d 100644 --- a/test_unstructured/staging/test_base_staging.py +++ b/test_unstructured/staging/test_base_staging.py @@ -409,6 +409,13 @@ def test_flatten_nested_dict(): assert base.flatten_dict(dictionary) == expected_result +def test_flatten_dict_with_tuples(): + """Flattening a dictionary with tuples""" + dictionary = {"a": 1, "b": (2, 3, 4), "c": {"d": 5, "e": (6, 7)}} + expected_result = {"a": 1, "b": (2, 3, 4), "c_d": 5, "c_e": (6, 7)} + assert base.flatten_dict(dictionary) == expected_result + + def test_flatten_dict_with_lists(): """Flattening a dictionary with lists""" dictionary = {"a": 1, "b": [2, 3, 4], "c": {"d": 5, "e": [6, 7]}} @@ -432,6 +439,13 @@ def test_flatten_dict_alt_separator(): assert base.flatten_dict(dictionary, separator=separator) == expected_result +def test_flatten_dict_flatten_tuple(): + """Flattening a dictionary with flatten_lists set to True, to flatten tuples""" + dictionary = {"a": 1, "b": (2, 3, 4), "c": {"d": 5, "e": (6, 7)}} + expected_result = {"a": 1, "b_0": 2, "b_1": 3, "b_2": 4, "c_d": 5, "c_e_0": 6, "c_e_1": 7} + assert base.flatten_dict(dictionary, flatten_lists=True) == expected_result + + def test_flatten_dict_flatten_list(): """Flattening a dictionary with flatten_lists set to True""" dictionary = {"a": 1, "b": [2, 3, 4], "c": {"d": 5, "e": [6, 7]}} diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py index 2056c12f6..08f29e9d1 100644 --- a/unstructured/staging/base.py +++ b/unstructured/staging/base.py @@ -181,7 +181,7 @@ def flatten_dict( ): """Flattens a nested dictionary into a single level dictionary. keys_to_omit is a list of keys that don't get flattened. If omitting a nested key, format as {parent_key}{separator}{key}. 
- If flatten_lists is True, then lists are flattened as well.""" + If flatten_lists is True, then lists and tuples are flattened as well.""" keys_to_omit = keys_to_omit if keys_to_omit else [] flattened_dict = {} for key, value in dictionary.items(): @@ -192,7 +192,7 @@ def flatten_dict( flattened_dict.update( flatten_dict(value, new_key, separator, flatten_lists, keys_to_omit=keys_to_omit), ) - elif isinstance(value, list) and flatten_lists: + elif isinstance(value, (list, tuple)) and flatten_lists: for index, item in enumerate(value): flattened_dict.update( flatten_dict(