From 1c1faa474215a27de704c1a446f8da9150e5cddd Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 4 Jul 2022 10:12:31 +0200 Subject: [PATCH] Make check of document & embedding count optional in FAISS and Pinecone (#2677) * make validation optional & add method call in pinecone init * Update Documentation & Code Style Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/_src/api/api/document_store.md | 6 ++++-- haystack/document_stores/faiss.py | 5 ++++- haystack/document_stores/pinecone.py | 5 +++++ .../json-schemas/haystack-pipeline-master.schema.json | 10 ++++++++++ 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index 65cf0b99f..0db018c27 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -2473,7 +2473,7 @@ the vector embeddings are indexed in a FAISS Index. #### FAISSDocumentStore.\_\_init\_\_ ```python -def __init__(sql_url: str = "sqlite:///faiss_document_store.db", vector_dim: int = None, embedding_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", faiss_index_path: Union[str, Path] = None, faiss_config_path: Union[str, Path] = None, isolation_level: str = None, n_links: int = 64, ef_search: int = 20, ef_construction: int = 80) +def __init__(sql_url: str = "sqlite:///faiss_document_store.db", vector_dim: int = None, embedding_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", faiss_index_path: Union[str, Path] = None, faiss_config_path: Union[str, Path] = None, isolation_level: str = None, n_links: int = 64, ef_search: int = 20, ef_construction: int = 80, validate_index_sync: bool = True) ``` **Arguments**: @@ -2523,6 +2523,7 @@ Can be created via calling `save()` - `n_links`: used only if index_factory == "HNSW" - `ef_search`: used only if index_factory == "HNSW" - `ef_construction`: used only if index_factory == "HNSW" +- `validate_index_sync`: Whether to check that the document count equals the embedding count at initialization time @@ -4672,7 +4673,7 @@ the vector embeddings and metadata (for filtering) are indexed in a Pinecone Ind #### PineconeDocumentStore.\_\_init\_\_ ```python -def __init__(api_key: str, environment: str = "us-west1-gcp", sql_url: str = "sqlite:///pinecone_document_store.db", pinecone_index: Optional[pinecone.Index] = None, embedding_dim: int = 768, return_embedding: bool = False, index: str = "document", similarity: str = "cosine", replicas: int = 1, shards: int = 1, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", recreate_index: bool = False, metadata_config: dict = {"indexed": []}) +def __init__(api_key: str, environment: str = "us-west1-gcp", sql_url: str = "sqlite:///pinecone_document_store.db", pinecone_index: Optional[pinecone.Index] = None, embedding_dim: int = 768, return_embedding: bool = False, index: str = "document", similarity: str = "cosine", replicas: int = 1, shards: int = 1, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", recreate_index: bool = False, metadata_config: dict = {"indexed": []}, validate_index_sync: bool = True) ``` **Arguments**: @@ -4709,6 +4710,7 @@ lost if you choose to recreate the index. Be aware that both the document_index be recreated. - `metadata_config`: Which metadata fields should be indexed. Should be in the format `{"indexed": ["metadata-field-1", "metadata-field-2", "metadata-field-n"]}`. +- `validate_index_sync`: Whether to check that the document count equals the embedding count at initialization time diff --git a/haystack/document_stores/faiss.py b/haystack/document_stores/faiss.py index 6ba8cef33..5aca2512f 100644 --- a/haystack/document_stores/faiss.py +++ b/haystack/document_stores/faiss.py @@ -60,6 +60,7 @@ class FAISSDocumentStore(SQLDocumentStore): n_links: int = 64, ef_search: int = 20, ef_construction: int = 80, + validate_index_sync: bool = True, ): """ :param sql_url: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale @@ -107,6 +108,7 @@ class FAISSDocumentStore(SQLDocumentStore): :param n_links: used only if index_factory == "HNSW" :param ef_search: used only if index_factory == "HNSW" :param ef_construction: used only if index_factory == "HNSW" + :param validate_index_sync: Whether to check that the document count equals the embedding count at initialization time """ # special case if we want to load an existing index from disk # load init params from disk and run init again @@ -162,7 +164,8 @@ class FAISSDocumentStore(SQLDocumentStore): url=sql_url, index=index, duplicate_documents=duplicate_documents, isolation_level=isolation_level ) - self._validate_index_sync() + if validate_index_sync: + self._validate_index_sync() def _validate_params_load_from_disk(self, sig: Signature, locals: dict): allowed_params = ["faiss_index_path", "faiss_config_path", "self"] diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 2f26d8e74..80980477c 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -53,6 +53,7 @@ class PineconeDocumentStore(SQLDocumentStore): duplicate_documents: str = "overwrite", recreate_index: bool = False, metadata_config: dict = {"indexed": []}, + validate_index_sync: bool = True, ): """ :param api_key: Pinecone vector database API key ([https://app.pinecone.io](https://app.pinecone.io)). @@ -88,6 +89,7 @@ class PineconeDocumentStore(SQLDocumentStore): be recreated. :param metadata_config: Which metadata fields should be indexed. Should be in the format `{"indexed": ["metadata-field-1", "metadata-field-2", "metadata-field-n"]}`. + :param validate_index_sync: Whether to check that the document count equals the embedding count at initialization time """ # Connect to Pinecone server using python client binding pinecone.init(api_key=api_key, environment=environment) @@ -141,6 +143,9 @@ class PineconeDocumentStore(SQLDocumentStore): metadata_config=self.metadata_config, ) + if validate_index_sync: + self._validate_index_sync() + def _sanitize_index_name(self, index: str) -> str: return index.replace("_", "-").lower() diff --git a/haystack/json-schemas/haystack-pipeline-master.schema.json b/haystack/json-schemas/haystack-pipeline-master.schema.json index b2490787d..a38490026 100644 --- a/haystack/json-schemas/haystack-pipeline-master.schema.json +++ b/haystack/json-schemas/haystack-pipeline-master.schema.json @@ -699,6 +699,11 @@ "title": "Ef Construction", "default": 80, "type": "integer" + }, + "validate_index_sync": { + "title": "Validate Index Sync", + "default": true, + "type": "boolean" } }, "additionalProperties": false, @@ -1537,6 +1542,11 @@ "indexed": [] }, "type": "object" + }, + "validate_index_sync": { + "title": "Validate Index Sync", + "default": true, + "type": "boolean" } }, "required": [