diff --git a/.semversioner/next-release/patch-20241029025329363730.json b/.semversioner/next-release/patch-20241029025329363730.json new file mode 100644 index 00000000..1e3995d6 --- /dev/null +++ b/.semversioner/next-release/patch-20241029025329363730.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "add backwards compatibility patch to vector store." +} diff --git a/.vscode/extensions.json b/.vscode/extensions.json index 6eb0e910..2e5e67a2 100644 --- a/.vscode/extensions.json +++ b/.vscode/extensions.json @@ -6,6 +6,7 @@ "ms-python.vscode-pylance", "bierner.markdown-mermaid", "streetsidesoftware.code-spell-checker", - "ronnidc.nunjucks" + "ronnidc.nunjucks", + "lucien-martijn.parquet-visualizer", ] } diff --git a/graphrag/api/query.py b/graphrag/api/query.py index 2897217c..d6d3bde3 100644 --- a/graphrag/api/query.py +++ b/graphrag/api/query.py @@ -182,18 +182,56 @@ async def local_search( ------ TODO: Document any exceptions to expect. """ - # TODO: must update filepath of lancedb (if used) until the new config engine has been implemented + #################################### BEGIN PATCH #################################### + # TODO: remove the following patch that checks for a vector_store prior to v1 release + # TODO: this is a backwards compatibility patch that injects the default vector_store settings into the config if it is not present + # Only applicable in situations involving a local vector_store (lancedb). The general idea: + # if vector_store not in config: + # 1. assume user is running local if vector_store is not in config + # 2. insert default vector_store in config + # 3. create lancedb vector_store instance + # 4. 
upload vector embeddings from the input dataframes to the vector_store + backwards_compatible = False + if not config.embeddings.vector_store: + backwards_compatible = True + from graphrag.query.input.loaders.dfs import store_entity_semantic_embeddings + from graphrag.vector_stores.lancedb import LanceDBVectorStore + + config.embeddings.vector_store = { + "type": "lancedb", + "db_uri": f"{Path(config.storage.base_dir)}/lancedb", + "collection_name": "entity_description_embeddings", + "overwrite": True, + } + _entities = read_indexer_entities(nodes, entities, community_level) + description_embedding_store = LanceDBVectorStore( + db_uri=config.embeddings.vector_store["db_uri"], + collection_name=config.embeddings.vector_store["collection_name"], + overwrite=config.embeddings.vector_store["overwrite"], + ) + description_embedding_store.connect( + db_uri=config.embeddings.vector_store["db_uri"] + ) + # dump embeddings from the entities list to the description_embedding_store + store_entity_semantic_embeddings( + entities=_entities, vectorstore=description_embedding_store + ) + #################################### END PATCH #################################### + + # TODO: update filepath of lancedb (if used) until the new config engine has been implemented # TODO: remove the type ignore annotations below once the new config engine has been refactored vector_store_type = config.embeddings.vector_store.get("type") # type: ignore vector_store_args = config.embeddings.vector_store - if vector_store_type == "lancedb": + if vector_store_type == VectorStoreType.LanceDB and not backwards_compatible: db_uri = config.embeddings.vector_store["db_uri"] # type: ignore lancedb_dir = Path(config.root_dir).resolve() / db_uri vector_store_args["db_uri"] = str(lancedb_dir) # type: ignore + reporter.info(f"Vector Store Args: {redact(vector_store_args)}") # type: ignore - description_embedding_store = _get_embedding_description_store( - config_args=vector_store_args, # type: ignore - ) + 
if not backwards_compatible: # can remove this check and always set the description_embedding_store before v1 release + description_embedding_store = _get_embedding_description_store( + config_args=vector_store_args, # type: ignore + ) _entities = read_indexer_entities(nodes, entities, community_level) _covariates = read_indexer_covariates(covariates) if covariates is not None else [] @@ -205,7 +243,7 @@ async def local_search( entities=_entities, relationships=read_indexer_relationships(relationships), covariates={"claims": _covariates}, - description_embedding_store=description_embedding_store, + description_embedding_store=description_embedding_store, # type: ignore response_type=response_type, ) @@ -251,18 +289,56 @@ async def local_search_streaming( ------ TODO: Document any exceptions to expect. """ + #################################### BEGIN PATCH #################################### + # TODO: remove the following patch that checks for a vector_store prior to v1 release + # TODO: this is a backwards compatibility patch that injects the default vector_store settings into the config if it is not present + # Only applicable in situations involving a local vector_store (lancedb). The general idea: + # if vector_store not in config: + # 1. assume user is running local if vector_store is not in config + # 2. insert default vector_store in config + # 3. create lancedb vector_store instance + # 4. 
upload vector embeddings from the input dataframes to the vector_store + backwards_compatible = False + if not config.embeddings.vector_store: + backwards_compatible = True + from graphrag.query.input.loaders.dfs import store_entity_semantic_embeddings + from graphrag.vector_stores.lancedb import LanceDBVectorStore + + config.embeddings.vector_store = { + "type": "lancedb", + "db_uri": f"{Path(config.storage.base_dir)}/lancedb", + "collection_name": "entity_description_embeddings", + "overwrite": True, + } + _entities = read_indexer_entities(nodes, entities, community_level) + description_embedding_store = LanceDBVectorStore( + db_uri=config.embeddings.vector_store["db_uri"], + collection_name=config.embeddings.vector_store["collection_name"], + overwrite=config.embeddings.vector_store["overwrite"], + ) + description_embedding_store.connect( + db_uri=config.embeddings.vector_store["db_uri"] + ) + # dump embeddings from the entities list to the description_embedding_store + store_entity_semantic_embeddings( + entities=_entities, vectorstore=description_embedding_store + ) + #################################### END PATCH #################################### + # TODO: must update filepath of lancedb (if used) until the new config engine has been implemented # TODO: remove the type ignore annotations below once the new config engine has been refactored - vector_store_type = config.embeddings.vector_store["type"] # type: ignore + vector_store_type = config.embeddings.vector_store.get("type") # type: ignore vector_store_args = config.embeddings.vector_store - if vector_store_type == VectorStoreType.LanceDB: + if vector_store_type == VectorStoreType.LanceDB and not backwards_compatible: db_uri = config.embeddings.vector_store["db_uri"] # type: ignore lancedb_dir = Path(config.root_dir).resolve() / db_uri vector_store_args["db_uri"] = str(lancedb_dir) # type: ignore + reporter.info(f"Vector Store Args: {redact(vector_store_args)}") # type: ignore - 
description_embedding_store = _get_embedding_description_store( - config_args=vector_store_args, # type: ignore - ) + if not backwards_compatible: # can remove this check and always set the description_embedding_store before v1 release + description_embedding_store = _get_embedding_description_store( + config_args=vector_store_args, # type: ignore + ) _entities = read_indexer_entities(nodes, entities, community_level) _covariates = read_indexer_covariates(covariates) if covariates is not None else [] @@ -274,7 +350,7 @@ async def local_search_streaming( entities=_entities, relationships=read_indexer_relationships(relationships), covariates={"claims": _covariates}, - description_embedding_store=description_embedding_store, + description_embedding_store=description_embedding_store, # type: ignore response_type=response_type, ) search_result = search_engine.astream_search(query=query) diff --git a/tests/fixtures/min-csv/settings.yml b/tests/fixtures/min-csv/settings.yml index 06543742..3b6954e2 100644 --- a/tests/fixtures/min-csv/settings.yml +++ b/tests/fixtures/min-csv/settings.yml @@ -6,12 +6,11 @@ embeddings: type: "lancedb" db_uri: "./tests/fixtures/min-csv/lancedb" collection_name: "lancedb_ci" + overwrite: True store_in_table: True - entity_name_description: title_column: "name" # id_column: "id" - # overwrite: true # entity_name: ... # relationship_description: ... # community_report_full_content: ... @@ -20,7 +19,6 @@ embeddings: # document_raw_content: ... # text_unit_text: ... - storage: type: file # or blob base_dir: "output/${timestamp}/artifacts"