Forbid usage of *args and **kwargs in any node's __init__ (#2362)

* Add failing test

* Remove `**kwargs` from docstores' `__init__` functions (#2407)

* Remove kwargs from ESDocStore subclasses

* Remove kwargs from subclasses of SQLDocumentStore

* Remove kwargs from Weaviate

* Revert change in pinecone

* Fix tests

* Fix retriever test with Weaviate

* Change Exception into DocumentStoreError

* Update Documentation & Code Style

* Remove `**kwargs` from `FARMReader` (#2413)

* Remove FARMReader kwargs without trying to replace them functionally

* Update Documentation & Code Style

* enforce same index values before and after saving/loading eval dataframes (#2398)

* Add tests for missing `__init__` and `super().__init__()` in custom nodes (#2350)

* Add tests for missing init and super

* Update Documentation & Code Style

* Change `in` check to `endswith`

* Move test in pipeline.py and change test in pipeline_yaml.py

* Update Documentation & Code Style

* Use caplog to test the warning

* Update Documentation & Code Style

* Move tests into test_pipeline and use get_config

* Update Documentation & Code Style

* Unmock version name

* Improve variadic args test

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Sara Zan 2022-04-14 16:42:02 +02:00 committed by GitHub
parent 46a50fb979
commit 929c685cda
17 changed files with 4849 additions and 75 deletions
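The thrust of the change: custom nodes must now declare every constructor parameter explicitly so that a JSON schema can be generated for them. A minimal sketch of the pattern that is now rejected versus the one that is accepted (class and parameter names are illustrative, not taken from this commit):

```python
from haystack.nodes.base import BaseComponent


class ExplicitParamsNode(BaseComponent):
    """Accepted: every __init__ parameter is declared explicitly."""

    outgoing_edges = 1

    def __init__(self, some_parameter: str, base_parameter: int = 1):
        super().__init__()
        self.some_parameter = some_parameter
        self.base_parameter = base_parameter

    def run(self, **kwargs):  # variadic parameters are only forbidden in __init__
        return {"some_parameter": self.some_parameter}, "output_1"


class VariadicParamsNode(ExplicitParamsNode):
    """Rejected: **kwargs hides the real signature, so loading a YAML pipeline
    that references this node raises PipelineSchemaError."""

    def __init__(self, some_parameter: str, **kwargs):
        super().__init__(some_parameter=some_parameter, **kwargs)
```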

View File

@ -414,7 +414,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore)
#### \_\_init\_\_
```python
def __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = "overwrite", index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym", use_system_proxy: bool = False)
def __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", similarity: str = "dot_product", timeout: int = 30, return_embedding: bool = False, duplicate_documents: str = "overwrite", index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym", use_system_proxy: bool = False)
```
A DocumentStore using Elasticsearch to store and query the documents for our search.
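For orientation, a minimal instantiation under the updated signature; the values shown are the documented defaults, and `similarity` and `timeout` are now explicitly typed:

```python
from haystack.document_stores import ElasticsearchDocumentStore

document_store = ElasticsearchDocumentStore(
    host="localhost",
    port=9200,
    similarity="dot_product",  # now typed as str
    timeout=30,                # now typed as int
)
```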
@ -1231,7 +1231,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore)
#### \_\_init\_\_
```python
def __init__(verify_certs=False, scheme="https", username="admin", password="admin", port=9200, **kwargs)
def __init__(scheme: str = "https", username: str = "admin", password: str = "admin", host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", ca_certs: Optional[str] = None, verify_certs: bool = False, recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", similarity: str = "dot_product", timeout: int = 30, return_embedding: bool = False, duplicate_documents: str = "overwrite", index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym", use_system_proxy: bool = False)
```
Document Store using OpenSearch (https://opensearch.org/). It is compatible with the AWS Elasticsearch Service.
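Because the subclass now spells out and forwards every parameter instead of relying on `**kwargs`, a default instance can still be created without any arguments; a minimal sketch using the defaults shown above:

```python
from haystack.document_stores import OpenSearchDocumentStore

# Defaults differ from the Elasticsearch parent: "https" scheme,
# "admin"/"admin" credentials, and verify_certs=False.
document_store = OpenSearchDocumentStore()
```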
@ -2235,7 +2235,7 @@ the vector embeddings are indexed in a FAISS Index.
#### \_\_init\_\_
```python
def __init__(sql_url: str = "sqlite:///faiss_document_store.db", vector_dim: int = None, embedding_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", faiss_index_path: Union[str, Path] = None, faiss_config_path: Union[str, Path] = None, isolation_level: str = None, **kwargs, ,)
def __init__(sql_url: str = "sqlite:///faiss_document_store.db", vector_dim: int = None, embedding_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", faiss_index_path: Union[str, Path] = None, faiss_config_path: Union[str, Path] = None, isolation_level: str = None, n_links: int = 64, ef_search: int = 20, ef_construction: int = 80)
```
**Arguments**:
@ -2282,6 +2282,9 @@ If specified no other params besides faiss_config_path must be specified.
- `faiss_config_path`: Stored FAISS initial configuration parameters.
Can be created via calling `save()`
- `isolation_level`: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level)
- `n_links`: used only if index_factory == "HNSW"
- `ef_search`: used only if index_factory == "HNSW"
- `ef_construction`: used only if index_factory == "HNSW"
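With `**kwargs` gone, the HNSW tuning values are plain keyword arguments; a minimal sketch using the defaults listed above:

```python
from haystack.document_stores import FAISSDocumentStore

document_store = FAISSDocumentStore(
    faiss_index_factory_str="HNSW",  # n_links / ef_* only take effect for HNSW
    n_links=64,
    ef_search=20,
    ef_construction=80,
)
```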
<a id="faiss.FAISSDocumentStore.write_documents"></a>
@ -2545,7 +2548,7 @@ Usage:
#### \_\_init\_\_
```python
def __init__(sql_url: str = "sqlite:///", milvus_url: str = "tcp://localhost:19530", connection_pool: str = "SingletonThread", index: str = "document", vector_dim: int = None, embedding_dim: int = 768, index_file_size: int = 1024, similarity: str = "dot_product", index_type: IndexType = IndexType.FLAT, index_param: Optional[Dict[str, Any]] = None, search_param: Optional[Dict[str, Any]] = None, return_embedding: bool = False, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", isolation_level: str = None, **kwargs, ,)
def __init__(sql_url: str = "sqlite:///", milvus_url: str = "tcp://localhost:19530", connection_pool: str = "SingletonThread", index: str = "document", vector_dim: int = None, embedding_dim: int = 768, index_file_size: int = 1024, similarity: str = "dot_product", index_type: IndexType = IndexType.FLAT, index_param: Optional[Dict[str, Any]] = None, search_param: Optional[Dict[str, Any]] = None, return_embedding: bool = False, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", isolation_level: str = None)
```
**Arguments**:
@ -3168,7 +3171,7 @@ The current implementation is not supporting the storage of labels, so you canno
#### \_\_init\_\_
```python
def __init__(host: Union[str, List[str]] = "http://localhost", port: Union[int, List[int]] = 8080, timeout_config: tuple = (5, 15), username: str = None, password: str = None, index: str = "Document", embedding_dim: int = 768, content_field: str = "content", name_field: str = "name", similarity: str = "cosine", index_type: str = "hnsw", custom_schema: Optional[dict] = None, return_embedding: bool = False, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", **kwargs, ,)
def __init__(host: Union[str, List[str]] = "http://localhost", port: Union[int, List[int]] = 8080, timeout_config: tuple = (5, 15), username: str = None, password: str = None, index: str = "Document", embedding_dim: int = 768, content_field: str = "content", name_field: str = "name", similarity: str = "cosine", index_type: str = "hnsw", custom_schema: Optional[dict] = None, return_embedding: bool = False, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite")
```
**Arguments**:

View File

@ -55,7 +55,7 @@ While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interf
#### \_\_init\_\_
```python
def __init__(model_name_or_path: str, model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, devices: List[torch.device] = [], no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0, use_confidence_scores: bool = True, confidence_threshold: Optional[float] = None, proxies: Optional[Dict[str, str]] = None, local_files_only=False, force_download=False, use_auth_token: Optional[Union[str, bool]] = None, **kwargs)
def __init__(model_name_or_path: str, model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, devices: List[torch.device] = [], no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0, use_confidence_scores: bool = True, confidence_threshold: Optional[float] = None, proxies: Optional[Dict[str, str]] = None, local_files_only=False, force_download=False, use_auth_token: Optional[Union[str, bool]] = None)
```
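With `**kwargs` removed, extra keyword arguments are no longer passed through to the underlying FARM `Inferencer`; a minimal construction sketch under the new signature (the model name is illustrative):

```python
from haystack.nodes import FARMReader

reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=False,
    top_k=5,
)
```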
**Arguments**:

View File

@ -23,6 +23,7 @@ from haystack.document_stores import KeywordDocumentStore
from haystack.schema import Document, Label
from haystack.document_stores.base import get_batches_from_generator
from haystack.document_stores.filter_utils import LogicalFilterClause
from haystack.errors import DocumentStoreError
logger = logging.getLogger(__name__)
@ -54,8 +55,8 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
recreate_index: bool = False,
create_index: bool = True,
refresh_type: str = "wait_for",
similarity="dot_product",
timeout=30,
similarity: str = "dot_product",
timeout: int = 30,
return_embedding: bool = False,
duplicate_documents: str = "overwrite",
index_type: str = "flat",
@ -179,9 +180,9 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
self.scroll = scroll
self.skip_missing_embeddings: bool = skip_missing_embeddings
if similarity in ["cosine", "dot_product", "l2"]:
self.similarity = similarity
self.similarity: str = similarity
else:
raise Exception(
raise DocumentStoreError(
f"Invalid value {similarity} for similarity in ElasticSearchDocumentStore constructor. Choose between 'cosine', 'l2' and 'dot_product'"
)
if index_type in ["flat", "hnsw"]:
@ -1592,7 +1593,42 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
class OpenSearchDocumentStore(ElasticsearchDocumentStore):
def __init__(self, verify_certs=False, scheme="https", username="admin", password="admin", port=9200, **kwargs):
def __init__(
self,
scheme: str = "https", # Mind this different default param
username: str = "admin", # Mind this different default param
password: str = "admin", # Mind this different default param
host: Union[str, List[str]] = "localhost",
port: Union[int, List[int]] = 9200,
api_key_id: Optional[str] = None,
api_key: Optional[str] = None,
aws4auth=None,
index: str = "document",
label_index: str = "label",
search_fields: Union[str, list] = "content",
content_field: str = "content",
name_field: str = "name",
embedding_field: str = "embedding",
embedding_dim: int = 768,
custom_mapping: Optional[dict] = None,
excluded_meta_data: Optional[list] = None,
analyzer: str = "standard",
ca_certs: Optional[str] = None,
verify_certs: bool = False, # Mind this different default param
recreate_index: bool = False,
create_index: bool = True,
refresh_type: str = "wait_for",
similarity: str = "dot_product",
timeout: int = 30,
return_embedding: bool = False,
duplicate_documents: str = "overwrite",
index_type: str = "flat",
scroll: str = "1d",
skip_missing_embeddings: bool = True,
synonyms: Optional[List] = None,
synonym_type: str = "synonym",
use_system_proxy: bool = False,
):
"""
Document Store using OpenSearch (https://opensearch.org/). It is compatible with the AWS Elasticsearch Service.
@ -1662,14 +1698,44 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
Synonym or Synonym_graph to handle synonyms, including multi-word synonyms correctly during the analysis process.
More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html
"""
super().__init__(
scheme=scheme,
username=username,
password=password,
host=host,
port=port,
api_key_id=api_key_id,
api_key=api_key,
aws4auth=aws4auth,
index=index,
label_index=label_index,
search_fields=search_fields,
content_field=content_field,
name_field=name_field,
embedding_field=embedding_field,
embedding_dim=embedding_dim,
custom_mapping=custom_mapping,
excluded_meta_data=excluded_meta_data,
analyzer=analyzer,
ca_certs=ca_certs,
verify_certs=verify_certs,
recreate_index=recreate_index,
create_index=create_index,
refresh_type=refresh_type,
similarity=similarity,
timeout=timeout,
return_embedding=return_embedding,
duplicate_documents=duplicate_documents,
index_type=index_type,
scroll=scroll,
skip_missing_embeddings=skip_missing_embeddings,
synonyms=synonyms,
synonym_type=synonym_type,
use_system_proxy=use_system_proxy,
)
self.embeddings_field_supports_similarity = False
self.similarity_to_space_type = {"cosine": "cosinesimil", "dot_product": "innerproduct", "l2": "l2"}
self.space_type_to_similarity = {v: k for k, v in self.similarity_to_space_type.items()}
# Overwrite default kwarg values of parent class so that in default cases we can initialize
# an OpenSearchDocumentStore without providing any arguments
super(OpenSearchDocumentStore, self).__init__(
verify_certs=verify_certs, scheme=scheme, username=username, password=password, port=port, **kwargs
)
def query_by_embedding(
self,
@ -1914,7 +1980,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
if not self.client.indices.exists(index=index_name, headers=headers):
raise e
def _get_embedding_field_mapping(self, similarity: Optional[str]):
def _get_embedding_field_mapping(self, similarity: str):
space_type = self.similarity_to_space_type[similarity]
method: dict = {"space_type": space_type, "name": "hnsw", "engine": "nmslib"}
@ -2049,10 +2115,79 @@ class OpenDistroElasticsearchDocumentStore(OpenSearchDocumentStore):
A DocumentStore which has an Open Distro for Elasticsearch service behind it.
"""
def __init__(self, similarity="cosine", **kwargs):
def __init__(
self,
scheme: str = "https",
username: str = "admin",
password: str = "admin",
host: Union[str, List[str]] = "localhost",
port: Union[int, List[int]] = 9200,
api_key_id: Optional[str] = None,
api_key: Optional[str] = None,
aws4auth=None,
index: str = "document",
label_index: str = "label",
search_fields: Union[str, list] = "content",
content_field: str = "content",
name_field: str = "name",
embedding_field: str = "embedding",
embedding_dim: int = 768,
custom_mapping: Optional[dict] = None,
excluded_meta_data: Optional[list] = None,
analyzer: str = "standard",
ca_certs: Optional[str] = None,
verify_certs: bool = False,
recreate_index: bool = False,
create_index: bool = True,
refresh_type: str = "wait_for",
similarity: str = "cosine", # Mind this different default param
timeout: int = 30,
return_embedding: bool = False,
duplicate_documents: str = "overwrite",
index_type: str = "flat",
scroll: str = "1d",
skip_missing_embeddings: bool = True,
synonyms: Optional[List] = None,
synonym_type: str = "synonym",
use_system_proxy: bool = False,
):
logger.warning(
"Open Distro for Elasticsearch has been replaced by OpenSearch! "
"See https://opensearch.org/faq/ for details. "
"We recommend using the OpenSearchDocumentStore instead."
)
super(OpenDistroElasticsearchDocumentStore, self).__init__(similarity=similarity, **kwargs)
super().__init__(
scheme=scheme,
username=username,
password=password,
host=host,
port=port,
api_key_id=api_key_id,
api_key=api_key,
aws4auth=aws4auth,
index=index,
label_index=label_index,
search_fields=search_fields,
content_field=content_field,
name_field=name_field,
embedding_field=embedding_field,
embedding_dim=embedding_dim,
custom_mapping=custom_mapping,
excluded_meta_data=excluded_meta_data,
analyzer=analyzer,
ca_certs=ca_certs,
verify_certs=verify_certs,
recreate_index=recreate_index,
create_index=create_index,
refresh_type=refresh_type,
similarity=similarity,
timeout=timeout,
return_embedding=return_embedding,
duplicate_documents=duplicate_documents,
index_type=index_type,
scroll=scroll,
skip_missing_embeddings=skip_missing_embeddings,
synonyms=synonyms,
synonym_type=synonym_type,
use_system_proxy=use_system_proxy,
)
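Given the deprecation warning above, migrating is essentially a rename: aside from the warning, the subclass only changes the `similarity` default. A hedged sketch of the equivalent call, assuming the shared connection defaults:

```python
from haystack.document_stores import OpenSearchDocumentStore

# Equivalent to a default OpenDistroElasticsearchDocumentStore,
# which forwards everything to this parent class with similarity="cosine".
document_store = OpenSearchDocumentStore(similarity="cosine")
```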

View File

@ -57,7 +57,9 @@ class FAISSDocumentStore(SQLDocumentStore):
faiss_index_path: Union[str, Path] = None,
faiss_config_path: Union[str, Path] = None,
isolation_level: str = None,
**kwargs,
n_links: int = 64,
ef_search: int = 20,
ef_construction: int = 80,
):
"""
:param sql_url: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale
@ -102,12 +104,15 @@ class FAISSDocumentStore(SQLDocumentStore):
:param faiss_config_path: Stored FAISS initial configuration parameters.
Can be created via calling `save()`
:param isolation_level: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level)
:param n_links: used only if index_factory == "HNSW"
:param ef_search: used only if index_factory == "HNSW"
:param ef_construction: used only if index_factory == "HNSW"
"""
# special case if we want to load an existing index from disk
# load init params from disk and run init again
if faiss_index_path is not None:
sig = signature(self.__class__.__init__)
self._validate_params_load_from_disk(sig, locals(), kwargs)
self._validate_params_load_from_disk(sig, locals())
init_params = self._load_init_params_from_config(faiss_index_path, faiss_config_path)
self.__class__.__init__(self, **init_params) # pylint: disable=non-parent-init-called
return
@ -141,7 +146,9 @@ class FAISSDocumentStore(SQLDocumentStore):
embedding_dim=self.embedding_dim,
index_factory=faiss_index_factory_str,
metric_type=self.metric_type,
**kwargs,
n_links=n_links,
ef_search=ef_search,
ef_construction=ef_construction,
)
self.return_embedding = return_embedding
@ -155,8 +162,8 @@ class FAISSDocumentStore(SQLDocumentStore):
self._validate_index_sync()
def _validate_params_load_from_disk(self, sig: Signature, locals: dict, kwargs: dict):
allowed_params = ["faiss_index_path", "faiss_config_path", "self", "kwargs"]
def _validate_params_load_from_disk(self, sig: Signature, locals: dict):
allowed_params = ["faiss_index_path", "faiss_config_path", "self"]
invalid_param_set = False
for param in sig.parameters.values():
@ -164,7 +171,7 @@ class FAISSDocumentStore(SQLDocumentStore):
invalid_param_set = True
break
if invalid_param_set or len(kwargs) > 0:
if invalid_param_set:
raise ValueError("if faiss_index_path is passed no other params besides faiss_config_path are allowed.")
def _validate_index_sync(self):
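For context, the `faiss_index_path` constraint validated above matches the documented load-from-disk flow; a hedged sketch, with illustrative file names:

```python
from haystack.document_stores import FAISSDocumentStore

# Persist an existing store; save() also writes the init-params config file.
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
document_store.save("my_faiss_index.faiss")

# Reload later: besides faiss_index_path, only faiss_config_path may be passed.
reloaded = FAISSDocumentStore(faiss_index_path="my_faiss_index.faiss")
```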
@ -179,14 +186,21 @@ class FAISSDocumentStore(SQLDocumentStore):
"was used when creating the original index."
)
def _create_new_index(self, embedding_dim: int, metric_type, index_factory: str = "Flat", **kwargs):
def _create_new_index(
self,
embedding_dim: int,
metric_type,
index_factory: str = "Flat",
n_links: int = 64,
ef_search: int = 20,
ef_construction: int = 80,
):
if index_factory == "HNSW":
# faiss index factory doesn't give the same results for HNSW IP, therefore direct init.
# defaults here are similar to DPR codebase (good accuracy, but very high RAM consumption)
n_links = kwargs.get("n_links", 64)
index = faiss.IndexHNSWFlat(embedding_dim, n_links, metric_type)
index.hnsw.efSearch = kwargs.get("efSearch", 20) # 20
index.hnsw.efConstruction = kwargs.get("efConstruction", 80) # 80
index.hnsw.efSearch = ef_search
index.hnsw.efConstruction = ef_construction
if "ivf" in index_factory.lower(): # enable reconstruction of vectors for inverted index
self.faiss_indexes[index].set_direct_map_type(faiss.DirectMap.Hashtable)

View File

@ -60,7 +60,6 @@ class Milvus1DocumentStore(SQLDocumentStore):
progress_bar: bool = True,
duplicate_documents: str = "overwrite",
isolation_level: str = None,
**kwargs,
):
"""
:param sql_url: SQL connection URL for storing document texts and metadata. It defaults to a local, file based SQLite DB. For large scale
@ -106,7 +105,9 @@ class Milvus1DocumentStore(SQLDocumentStore):
exists.
:param isolation_level: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level)
"""
super().__init__()
super().__init__(
url=sql_url, index=index, duplicate_documents=duplicate_documents, isolation_level=isolation_level
)
self.milvus_server = Milvus(uri=milvus_url, pool=connection_pool)
@ -141,10 +142,6 @@ class Milvus1DocumentStore(SQLDocumentStore):
self.embedding_field = embedding_field
self.progress_bar = progress_bar
super().__init__(
url=sql_url, index=index, duplicate_documents=duplicate_documents, isolation_level=isolation_level
)
def __del__(self):
return self.milvus_server.close()

View File

@ -126,7 +126,9 @@ class Milvus2DocumentStore(SQLDocumentStore):
exists.
:param isolation_level: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level)
"""
super().__init__()
super().__init__(
url=sql_url, index=index, duplicate_documents=duplicate_documents, isolation_level=isolation_level
)
connections.add_connection(default={"host": host, "port": port})
connections.connect()
@ -171,10 +173,6 @@ class Milvus2DocumentStore(SQLDocumentStore):
self.return_embedding = return_embedding
self.progress_bar = progress_bar
super().__init__(
url=sql_url, index=index, duplicate_documents=duplicate_documents, isolation_level=isolation_level
)
def _create_collection_and_index_if_not_exist(
self, index: Optional[str] = None, consistency_level: int = 0, index_param: Optional[Dict[str, Any]] = None
):

View File

@ -81,7 +81,6 @@ class PineconeDocumentStore(SQLDocumentStore):
- `"overwrite"`: Update any existing documents with the same ID when adding documents.
- `"fail"`: An error is raised if the document ID of the document being added already exists.
"""
# Connect to Pinecone server using python client binding
pinecone.init(api_key=api_key, environment=environment)
self._api_key = api_key
@ -129,8 +128,6 @@ class PineconeDocumentStore(SQLDocumentStore):
super().__init__(url=sql_url, index=clean_index, duplicate_documents=duplicate_documents)
# self._validate_index_sync()
def _sanitize_index_name(self, index: str) -> str:
return index.replace("_", "-").lower()

View File

@ -70,7 +70,6 @@ class WeaviateDocumentStore(BaseDocumentStore):
embedding_field: str = "embedding",
progress_bar: bool = True,
duplicate_documents: str = "overwrite",
**kwargs,
):
"""
:param host: Weaviate server connection URL for storing and processing documents and vectors.

View File

@ -15,9 +15,6 @@
},
{
"const": "1.3.0"
},
{
"const": "1.3.1rc0"
}
]
},

File diff suppressed because it is too large.

View File

@ -13,12 +13,6 @@
{
"const": "unstable"
},
{
"const": "1.2.1rc0"
},
{
"const": "1.3.0"
},
{
"const": "1.3.1rc0"
}
@ -470,11 +464,13 @@
},
"similarity": {
"title": "Similarity",
"default": "dot_product"
"default": "dot_product",
"type": "string"
},
"timeout": {
"title": "Timeout",
"default": 30
"default": 30,
"type": "integer"
},
"return_embedding": {
"title": "Return Embedding",
@ -626,6 +622,21 @@
"isolation_level": {
"title": "Isolation Level",
"type": "string"
},
"n_links": {
"title": "N Links",
"default": 64,
"type": "integer"
},
"ef_search": {
"title": "Ef Search",
"default": 20,
"type": "integer"
},
"ef_construction": {
"title": "Ef Construction",
"default": 80,
"type": "integer"
}
},
"additionalProperties": false,
@ -918,9 +929,192 @@
"title": "Parameters",
"type": "object",
"properties": {
"scheme": {
"title": "Scheme",
"default": "https",
"type": "string"
},
"username": {
"title": "Username",
"default": "admin",
"type": "string"
},
"password": {
"title": "Password",
"default": "admin",
"type": "string"
},
"host": {
"title": "Host",
"default": "localhost",
"anyOf": [
{
"type": "string"
},
{
"type": "array",
"items": {
"type": "string"
}
}
]
},
"port": {
"title": "Port",
"default": 9200,
"anyOf": [
{
"type": "integer"
},
{
"type": "array",
"items": {
"type": "integer"
}
}
]
},
"api_key_id": {
"title": "Api Key Id",
"type": "string"
},
"api_key": {
"title": "Api Key",
"type": "string"
},
"aws4auth": {
"title": "Aws4Auth"
},
"index": {
"title": "Index",
"default": "document",
"type": "string"
},
"label_index": {
"title": "Label Index",
"default": "label",
"type": "string"
},
"search_fields": {
"title": "Search Fields",
"default": "content",
"anyOf": [
{
"type": "string"
},
{
"type": "array",
"items": {}
}
]
},
"content_field": {
"title": "Content Field",
"default": "content",
"type": "string"
},
"name_field": {
"title": "Name Field",
"default": "name",
"type": "string"
},
"embedding_field": {
"title": "Embedding Field",
"default": "embedding",
"type": "string"
},
"embedding_dim": {
"title": "Embedding Dim",
"default": 768,
"type": "integer"
},
"custom_mapping": {
"title": "Custom Mapping",
"type": "object"
},
"excluded_meta_data": {
"title": "Excluded Meta Data",
"type": "array",
"items": {}
},
"analyzer": {
"title": "Analyzer",
"default": "standard",
"type": "string"
},
"ca_certs": {
"title": "Ca Certs",
"type": "string"
},
"verify_certs": {
"title": "Verify Certs",
"default": false,
"type": "boolean"
},
"recreate_index": {
"title": "Recreate Index",
"default": false,
"type": "boolean"
},
"create_index": {
"title": "Create Index",
"default": true,
"type": "boolean"
},
"refresh_type": {
"title": "Refresh Type",
"default": "wait_for",
"type": "string"
},
"similarity": {
"title": "Similarity",
"default": "cosine"
"default": "cosine",
"type": "string"
},
"timeout": {
"title": "Timeout",
"default": 30,
"type": "integer"
},
"return_embedding": {
"title": "Return Embedding",
"default": false,
"type": "boolean"
},
"duplicate_documents": {
"title": "Duplicate Documents",
"default": "overwrite",
"type": "string"
},
"index_type": {
"title": "Index Type",
"default": "flat",
"type": "string"
},
"scroll": {
"title": "Scroll",
"default": "1d",
"type": "string"
},
"skip_missing_embeddings": {
"title": "Skip Missing Embeddings",
"default": true,
"type": "boolean"
},
"synonyms": {
"title": "Synonyms",
"type": "array",
"items": {}
},
"synonym_type": {
"title": "Synonym Type",
"default": "synonym",
"type": "string"
},
"use_system_proxy": {
"title": "Use System Proxy",
"default": false,
"type": "boolean"
}
},
"additionalProperties": false,
@ -951,25 +1145,192 @@
"title": "Parameters",
"type": "object",
"properties": {
"verify_certs": {
"title": "Verify Certs",
"default": false
},
"scheme": {
"title": "Scheme",
"default": "https"
"default": "https",
"type": "string"
},
"username": {
"title": "Username",
"default": "admin"
"default": "admin",
"type": "string"
},
"password": {
"title": "Password",
"default": "admin"
"default": "admin",
"type": "string"
},
"host": {
"title": "Host",
"default": "localhost",
"anyOf": [
{
"type": "string"
},
{
"type": "array",
"items": {
"type": "string"
}
}
]
},
"port": {
"title": "Port",
"default": 9200
"default": 9200,
"anyOf": [
{
"type": "integer"
},
{
"type": "array",
"items": {
"type": "integer"
}
}
]
},
"api_key_id": {
"title": "Api Key Id",
"type": "string"
},
"api_key": {
"title": "Api Key",
"type": "string"
},
"aws4auth": {
"title": "Aws4Auth"
},
"index": {
"title": "Index",
"default": "document",
"type": "string"
},
"label_index": {
"title": "Label Index",
"default": "label",
"type": "string"
},
"search_fields": {
"title": "Search Fields",
"default": "content",
"anyOf": [
{
"type": "string"
},
{
"type": "array",
"items": {}
}
]
},
"content_field": {
"title": "Content Field",
"default": "content",
"type": "string"
},
"name_field": {
"title": "Name Field",
"default": "name",
"type": "string"
},
"embedding_field": {
"title": "Embedding Field",
"default": "embedding",
"type": "string"
},
"embedding_dim": {
"title": "Embedding Dim",
"default": 768,
"type": "integer"
},
"custom_mapping": {
"title": "Custom Mapping",
"type": "object"
},
"excluded_meta_data": {
"title": "Excluded Meta Data",
"type": "array",
"items": {}
},
"analyzer": {
"title": "Analyzer",
"default": "standard",
"type": "string"
},
"ca_certs": {
"title": "Ca Certs",
"type": "string"
},
"verify_certs": {
"title": "Verify Certs",
"default": false,
"type": "boolean"
},
"recreate_index": {
"title": "Recreate Index",
"default": false,
"type": "boolean"
},
"create_index": {
"title": "Create Index",
"default": true,
"type": "boolean"
},
"refresh_type": {
"title": "Refresh Type",
"default": "wait_for",
"type": "string"
},
"similarity": {
"title": "Similarity",
"default": "dot_product",
"type": "string"
},
"timeout": {
"title": "Timeout",
"default": 30,
"type": "integer"
},
"return_embedding": {
"title": "Return Embedding",
"default": false,
"type": "boolean"
},
"duplicate_documents": {
"title": "Duplicate Documents",
"default": "overwrite",
"type": "string"
},
"index_type": {
"title": "Index Type",
"default": "flat",
"type": "string"
},
"scroll": {
"title": "Scroll",
"default": "1d",
"type": "string"
},
"skip_missing_embeddings": {
"title": "Skip Missing Embeddings",
"default": true,
"type": "boolean"
},
"synonyms": {
"title": "Synonyms",
"type": "array",
"items": {}
},
"synonym_type": {
"title": "Synonym Type",
"default": "synonym",
"type": "string"
},
"use_system_proxy": {
"title": "Use System Proxy",
"default": false,
"type": "boolean"
}
},
"additionalProperties": false,

View File

@ -58,9 +58,6 @@
},
{
"const": "1.3.0"
},
{
"const": "1.3.1rc0"
}
]
}
@ -70,6 +67,24 @@
"$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/json-schemas/haystack-pipeline-1.2.1rc0.schema.json"
}
]
},
{
"allOf": [
{
"properties": {
"version": {
"oneOf": [
{
"const": "1.3.1rc0"
}
]
}
}
},
{
"$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/json-schemas/haystack-pipeline-1.3.1rc0.schema.json"
}
]
}
]
}

View File

@ -154,6 +154,13 @@ def create_schema_for_node_class(node_class: Type[BaseComponent]) -> Tuple[Dict[
raise PipelineSchemaError(f"Could not read the __init__ method of {node_name} to create its schema.")
signature = get_typed_signature(init_method)
# Check for variadic parameters (*args or **kwargs) and raise an exception if found
if any(param.kind in {param.VAR_POSITIONAL, param.VAR_KEYWORD} for param in signature.parameters.values()):
raise PipelineSchemaError(
"Nodes cannot use variadic parameters like *args or **kwargs in their __init__ function."
)
param_fields = [
param for param in signature.parameters.values() if param.kind not in {param.VAR_POSITIONAL, param.VAR_KEYWORD}
]

View File

@ -62,7 +62,6 @@ class FARMReader(BaseReader):
local_files_only=False,
force_download=False,
use_auth_token: Optional[Union[str, bool]] = None,
**kwargs,
):
"""
@ -140,7 +139,6 @@ class FARMReader(BaseReader):
force_download=force_download,
devices=self.devices,
use_auth_token=use_auth_token,
**kwargs,
)
self.inferencer.model.prediction_heads[0].context_window_size = context_window_size
self.inferencer.model.prediction_heads[0].no_ans_boost = no_ans_boost

View File

@ -842,9 +842,7 @@ def get_document_store(
)
elif document_store_type == "weaviate":
document_store = WeaviateDocumentStore(
weaviate_url="http://localhost:8080", index=index, similarity=similarity, embedding_dim=embedding_dim
)
document_store = WeaviateDocumentStore(index=index, similarity=similarity, embedding_dim=embedding_dim)
document_store.weaviate_client.schema.delete_all()
document_store._create_schema_and_index_if_not_exist()

View File

@ -667,6 +667,72 @@ def test_load_yaml_custom_component_with_superclass(tmp_path):
Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml")
def test_load_yaml_custom_component_with_variadic_args(tmp_path):
class BaseCustomNode(MockNode):
def __init__(self, base_parameter: int):
super().__init__()
self.base_parameter = base_parameter
class CustomNode(BaseCustomNode):
def __init__(self, some_parameter: str, *args):
super().__init__(*args)
self.some_parameter = some_parameter
with open(tmp_path / "tmp_config.yml", "w") as tmp_file:
tmp_file.write(
f"""
version: unstable
components:
- name: custom_node
type: CustomNode
params:
base_parameter: 1
some_parameter: value
pipelines:
- name: my_pipeline
nodes:
- name: custom_node
inputs:
- Query
"""
)
with pytest.raises(PipelineSchemaError, match="variadic"):
Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml")
def test_load_yaml_custom_component_with_variadic_kwargs(tmp_path):
class BaseCustomNode(MockNode):
def __init__(self, base_parameter: int):
super().__init__()
self.base_parameter = base_parameter
class CustomNode(BaseCustomNode):
def __init__(self, some_parameter: str, **kwargs):
super().__init__(**kwargs)
self.some_parameter = some_parameter
with open(tmp_path / "tmp_config.yml", "w") as tmp_file:
tmp_file.write(
f"""
version: unstable
components:
- name: custom_node
type: CustomNode
params:
base_parameter: 1
some_parameter: value
pipelines:
- name: my_pipeline
nodes:
- name: custom_node
inputs:
- Query
"""
)
with pytest.raises(PipelineSchemaError, match="variadic"):
Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml")
def test_load_yaml_no_pipelines(tmp_path):
with open(tmp_path / "tmp_config.yml", "w") as tmp_file:
tmp_file.write(

View File

@ -192,9 +192,7 @@ def test_retribert_embedding(document_store, retriever, docs):
if isinstance(document_store, WeaviateDocumentStore):
# Weaviate sets the embedding dimension to 768 as soon as it is initialized.
# We need 128 here and therefore initialize a new WeaviateDocumentStore.
document_store = WeaviateDocumentStore(
weaviate_url="http://localhost:8080", index="haystack_test", embedding_dim=128
)
document_store = WeaviateDocumentStore(index="haystack_test", embedding_dim=128)
document_store.weaviate_client.schema.delete_all()
document_store._create_schema_and_index_if_not_exist()
document_store.return_embedding = True