fix: link_texts was breaking postgres destination connector (#2460)

Formatting of link_texts was breaking metadata storage. It turns out link_texts
doesn't need any conforming at all; the value already comes through correctly
from the JSON.
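
For context, a minimal sketch of the difference in what gets handed to metadata
storage (variable names here are illustrative, not taken from the connector):

    import json

    # link_texts as it arrives after parsing the element JSON
    link_texts = ["Skip to main content"]

    # old behavior (the branch removed by this commit): re-encode the already
    # parsed list, so storage received a JSON string instead of a list
    old_value = str(json.dumps(link_texts))  # '["Skip to main content"]'

    # new behavior: the parsed list is passed through unchanged
    new_value = link_texts  # ["Skip to main content"]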

---------

Co-authored-by: potter-potter <david.potter@gmail.com>
David Potter 2024-01-26 20:29:38 -08:00 committed by GitHub
parent d5a6f4b82c
commit 74dcca44ca
4 changed files with 47 additions and 5 deletions


@@ -1,4 +1,4 @@
-## 0.12.3-dev6
+## 0.12.3-dev7
### Enhancements
@@ -17,6 +17,7 @@
* **Fix databricks-volumes extra location.** `setup.py` is currently pointing to the wrong location for the databricks-volumes extra requirements. This results in errors when trying to build the wheel for unstructured. This change updates to point to the correct path.
* **Fix uploading None values to Chroma and Pinecone.** Removes keys with None values with Pinecone and Chroma destinations. Pins Pinecone dependency
* **Update documentation.** (i) best practice for table extration by using 'skip_infer_table_types' param, instead of 'pdf_infer_table_structure', and (ii) fixed CSS, RST issues and typo in the documentation.
+* **Fix postgres storage of link_texts.** Formatting of link_texts was breaking metadata storage.
## 0.12.2


@@ -60,6 +60,23 @@ TEST_DATA_2 = {
    "embeddings": [0.1, 0.2, 0.3],
}
+TEST_DATA_3 = {
+    "metadata": {
+        "coordinates": {"points": [1, 2, 3]},
+        "data_source": {
+            "date_created": "2021-01-01T00:00:00",
+            "date_modified": "2021-01-02T00:00:00",
+            "date_processed": "2022-12-13T15:44:08",
+            "version": 1.1,
+        },
+        "last_modified": "2021-01-03T00:00:00",
+        "page_number": 10,
+        "link_texts": ["Skip to main content"],
+        "link_urls": ["#main-content"],
+    },
+    "embeddings": [0.1, 0.2, 0.3],
+}
def test_conform_dict_1():
"""Validate that the conform_dict method returns the expected output for a real example"""
@@ -125,3 +142,30 @@ def test_conform_dict_2():
"version": "1.1",
"points": "[1, 2, 3]",
}
+def test_conform_dict_link_texts():
+    """Validate that the conform_dict method returns the expected output for link_texts"""
+    # Create a mock instance of the connector class
+    connector = SqlDestinationConnector(write_config=Mock(), connector_config=Mock())
+    # Mock the uuid.uuid4 function to return a fixed value
+    with patch("uuid.uuid4", return_value="mocked_uuid"):
+        # Call the conform_dict method
+        data_out = TEST_DATA_3.copy()
+        connector.conform_dict(data_out)
+    # Assert that the result matches the expected output
+    assert data_out == {
+        "embeddings": "[0.1, 0.2, 0.3]",
+        "id": "mocked_uuid",
+        "last_modified": datetime.datetime(2021, 1, 3, 0, 0),
+        "link_texts": ["Skip to main content"],
+        "link_urls": ["#main-content"],
+        "page_number": "10",
+        "date_created": datetime.datetime(2021, 1, 1, 0, 0),
+        "date_modified": datetime.datetime(2021, 1, 2, 0, 0),
+        "date_processed": datetime.datetime(2022, 12, 13, 15, 44, 8),
+        "version": "1.1",
+        "points": "[1, 2, 3]",
+    }


@@ -1 +1 @@
-__version__ = "0.12.3-dev6"  # pragma: no cover
+__version__ = "0.12.3-dev7"  # pragma: no cover


@@ -127,9 +127,6 @@ class SqlDestinationConnector(BaseDestinationConnector):
        ):
            data["metadata"]["data_source"]["permissions_data"] = json.dumps(permissions_data)
-        if link_texts := data.get("metadata", {}).get("link_texts", {}):
-            data["metadata"]["link_texts"] = str(json.dumps(link_texts))
        if sent_from := data.get("metadata", {}).get("sent_from", {}):
            data["metadata"]["sent_from"] = str(json.dumps(sent_from))