mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-07 00:52:42 +00:00

- Adds a destination connector to upload processed output into a PostgreSQL/SQLite database instance. - Users are responsible for providing their own instances. This PR includes a couple of configuration examples. - Defines the scripts required to set up a PostgreSQL instance with the unstructured elements schema. - Validates postgres/pgvector embedding storage and retrieval --------- Co-authored-by: potter-potter <david.potter@gmail.com>
126 lines
4.6 KiB
Python
126 lines
4.6 KiB
Python
import datetime
|
|
from unittest.mock import Mock, patch
|
|
|
|
from unstructured.ingest.connector.sql import SqlDestinationConnector
|
|
|
|
# Full-featured sample element used as the input of test_conform_dict_1.
# It carries nested "coordinates" and "data_source" metadata, timestamps in
# both "YYYY-MM-DD HH:MM:SS.ffffff" and ISO-8601 "T"-separated form, and an
# "embeddings" vector.
TEST_DATA_1 = {
    "element_id": "80803034fe04181c163306740700cc54",
    "metadata": {
        "coordinates": {
            "layout_height": 792,
            "layout_width": 612,
            "points": [
                [72.0, 72.69200000000001],
                [72.0, 83.69200000000001],
                [135.8, 83.69200000000001],
                [135.8, 72.69200000000001],
            ],
            "system": "PixelSpace",
        },
        "data_source": {
            "date_created": "2023-10-25 10:05:44.976775",
            "date_modified": "2023-10-25 10:05:44.976775",
            "date_processed": "2023-12-14T17:06:33.074057",
            "permissions_data": [{"mode": 33188}],
            "url": "example-docs/fake-memo.pdf",
        },
        "file_directory": "example-docs",
        "filename": "fake-memo.pdf",
        "filetype": "application/pdf",
        "languages": ["eng"],
        "last_modified": "2023-10-25T10:05:44",
        "page_number": 1,
    },
    "text": "May 5, 2023",
    "type": "UncategorizedText",
    "embeddings": [
        -0.05623878538608551,
        0.008579030632972717,
        0.03698136284947395,
        -0.01745658740401268,
        -0.030465232208371162,
        0.00996527448296547,
    ],
}
|
|
|
|
# Minimal sample element used as the input of test_conform_dict_2.
# It has no top-level "element_id"/"text"/"type" keys and exercises the
# "links", "regex_metadata" and data_source "version" metadata fields that
# TEST_DATA_1 does not contain.
TEST_DATA_2 = {
    "metadata": {
        "coordinates": {"points": [1, 2, 3]},
        "links": {"link1": "https://example.com", "link2": "https://example.org"},
        "data_source": {
            "date_created": "2021-01-01T00:00:00",
            "date_modified": "2021-01-02T00:00:00",
            "date_processed": "2022-12-13T15:44:08",
            "version": 1.1,
        },
        "last_modified": "2021-01-03T00:00:00",
        "page_number": 10,
        "regex_metadata": {"pattern": "abc"},
    },
    "embeddings": [0.1, 0.2, 0.3],
}
|
|
|
|
|
|
def test_conform_dict_1():
    """Validate that the conform_dict method returns the expected output for a real example.

    The expected dict below pins the following for TEST_DATA_1: nested
    metadata keys appear at the top level, timestamp strings become
    ``datetime`` objects, list values (``points``, ``permissions_data``,
    ``embeddings``) become JSON strings, ``page_number`` becomes a string,
    and an ``id`` key holding the generated UUID is added.
    """
    # A real connector instance; only its two config arguments are mocks.
    connector = SqlDestinationConnector(write_config=Mock(), connector_config=Mock())

    # Pin uuid.uuid4 so the generated "id" value is deterministic.
    with patch("uuid.uuid4", return_value="mocked_uuid"):
        data_out = connector.conform_dict(TEST_DATA_1)

    # Assert that the result matches the expected output
    assert data_out == {
        "element_id": "80803034fe04181c163306740700cc54",
        "text": "May 5, 2023",
        "type": "UncategorizedText",
        "id": "mocked_uuid",
        "file_directory": "example-docs",
        "filename": "fake-memo.pdf",
        "filetype": "application/pdf",
        "languages": ["eng"],
        "last_modified": datetime.datetime(2023, 10, 25, 10, 5, 44),
        "page_number": "1",
        "date_created": datetime.datetime(2023, 10, 25, 10, 5, 44, 976775),
        "date_modified": datetime.datetime(2023, 10, 25, 10, 5, 44, 976775),
        "date_processed": datetime.datetime(2023, 12, 14, 17, 6, 33, 74057),
        "permissions_data": '[{"mode": 33188}]',
        "url": "example-docs/fake-memo.pdf",
        "layout_height": 792,
        "layout_width": 612,
        "points": "[[72.0, 72.69200000000001], [72.0, 83.69200000000001],"
        " [135.8, 83.69200000000001], [135.8, 72.69200000000001]]",
        "system": "PixelSpace",
        "embeddings": "[-0.05623878538608551, 0.008579030632972717, "
        "0.03698136284947395, -0.01745658740401268, "
        "-0.030465232208371162, 0.00996527448296547]",
    }
|
|
|
|
|
|
def test_conform_dict_2():
    """Validate that the conform_dict method returns the expected output for a simplified example.

    The expected dict below pins the following for TEST_DATA_2: dict values
    (``links``, ``regex_metadata``) and list values (``points``,
    ``embeddings``) become JSON strings, ``version`` and ``page_number``
    become strings, timestamp strings become ``datetime`` objects, and an
    ``id`` key holding the generated UUID is added.
    """
    # A real connector instance; only its two config arguments are mocks.
    connector = SqlDestinationConnector(write_config=Mock(), connector_config=Mock())

    # Pin uuid.uuid4 so the generated "id" value is deterministic.
    with patch("uuid.uuid4", return_value="mocked_uuid"):
        data_out = connector.conform_dict(TEST_DATA_2)

    # Assert that the result matches the expected output
    assert data_out == {
        "embeddings": "[0.1, 0.2, 0.3]",
        "id": "mocked_uuid",
        "links": '{"link1": "https://example.com", "link2": "https://example.org"}',
        "last_modified": datetime.datetime(2021, 1, 3, 0, 0),
        "page_number": "10",
        "regex_metadata": '{"pattern": "abc"}',
        "date_created": datetime.datetime(2021, 1, 1, 0, 0),
        "date_modified": datetime.datetime(2021, 1, 2, 0, 0),
        "date_processed": datetime.datetime(2022, 12, 13, 15, 44, 8),
        "version": "1.1",
        "points": "[1, 2, 3]",
    }
|