diff --git a/.github/utils/tutorials.sh b/.github/utils/tutorials.sh
index 9265b27f8..10a7ee9c3 100755
--- a/.github/utils/tutorials.sh
+++ b/.github/utils/tutorials.sh
@@ -39,7 +39,7 @@ done

# Run the containers
docker run -d -p 9200:9200 --name elasticsearch -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2
-docker run -d -p 9998:9998 --name tika -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.24.1
+docker run -d -p 9998:9998 --name tika -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.28.4

failed=""

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 87dd3ca63..473a7d266 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -656,7 +656,7 @@ jobs:
      - name: Run Opensearch
        run: |
-          docker run -d -p 9201:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:1.2.4
+          docker run -d -p 9201:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:1.3.5

      - name: Run Milvus
        run: |
@@ -672,7 +672,7 @@ jobs:
        run: docker run -d -p 7200:7200 --name haystack_test_graphdb deepset/graphdb-free:9.4.1-adoptopenjdk11

      - name: Run Apache Tika
-        run: docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.24.1
+        run: docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.28.4

      - name: Run Parsr
        run: docker run -d -p 3001:3001 axarev/parsr:v1.2.2

diff --git a/.github/workflows/tutorials_nightly.yml b/.github/workflows/tutorials_nightly.yml
index 22e8fdadf..37fe5fa9b 100644
--- a/.github/workflows/tutorials_nightly.yml
+++ b/.github/workflows/tutorials_nightly.yml
@@ -28,7 +28,7 @@ jobs:
        run: docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2

      - name: Run Apache Tika
-        run: docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.24.1
+        run: docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.28.4

      - name: Run GraphDB
        run: docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 77b48b7ed..e3093d539 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -199,7 +199,7 @@ docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_AN
docker run -d -p 7200:7200 --name haystack_test_graphdb deepset/graphdb-free:9.4.1-adoptopenjdk11

# Tika
-docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.24.1
+docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.28.4
```

Tests can also be run **individually**:
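Since the Tika image tag changes in three CI files above, a quick smoke test helps confirm the upgraded container still answers on port 9998. A minimal sketch using `requests`; the port mapping comes from the commands above, while the `/version` endpoint of tika-server is an assumption on my part:

```python
import requests

# Port matches the `-p 9998:9998` mapping used in the CI commands above.
TIKA_URL = "http://localhost:9998"

def tika_is_up() -> bool:
    """Return True if the upgraded Tika server answers (assumes a /version endpoint)."""
    try:
        response = requests.get(f"{TIKA_URL}/version", timeout=5)
        return response.ok and "1.28.4" in response.text
    except requests.ConnectionError:
        return False

if __name__ == "__main__":
    print("Tika 1.28.4 reachable:", tika_is_up())
```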
diff --git a/docs/_src/api/api/crawler.md b/docs/_src/api/api/crawler.md
index d2e9b08c9..cdab2adef 100644
--- a/docs/_src/api/api/crawler.md
+++ b/docs/_src/api/api/crawler.md
@@ -27,7 +27,17 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus

#### Crawler.\_\_init\_\_

```python
-def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None, webdriver_options: Optional[List[str]] = None)
+def __init__(output_dir: str,
+             urls: Optional[List[str]] = None,
+             crawler_depth: int = 1,
+             filter_urls: Optional[List] = None,
+             overwrite_existing_files=True,
+             id_hash_keys: Optional[List[str]] = None,
+             extract_hidden_text=True,
+             loading_wait_time: Optional[int] = None,
+             crawler_naming_function: Optional[Callable[[str, str],
+                                                        str]] = None,
+             webdriver_options: Optional[List[str]] = None)
```

Init object with basic params for crawling (can be overwritten later).

@@ -73,7 +83,17 @@ See [Chrome Web Driver Options](https://selenium-python.readthedocs.io/api.html#

#### Crawler.crawl

```python
-def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) -> List[Path]
+def crawl(
+        output_dir: Union[str, Path, None] = None,
+        urls: Optional[List[str]] = None,
+        crawler_depth: Optional[int] = None,
+        filter_urls: Optional[List] = None,
+        overwrite_existing_files: Optional[bool] = None,
+        id_hash_keys: Optional[List[str]] = None,
+        extract_hidden_text: Optional[bool] = None,
+        loading_wait_time: Optional[int] = None,
+        crawler_naming_function: Optional[Callable[[str, str], str]] = None
+) -> List[Path]
```

Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON

@@ -116,7 +136,18 @@ List of paths where the crawled webpages got stored

#### Crawler.run

```python
-def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]
+def run(
+    output_dir: Union[str, Path, None] = None,
+    urls: Optional[List[str]] = None,
+    crawler_depth: Optional[int] = None,
+    filter_urls: Optional[List] = None,
+    overwrite_existing_files: Optional[bool] = None,
+    return_documents: Optional[bool] = False,
+    id_hash_keys: Optional[List[str]] = None,
+    extract_hidden_text: Optional[bool] = True,
+    loading_wait_time: Optional[int] = None,
+    crawler_naming_function: Optional[Callable[[str, str], str]] = None
+) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]
```

Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
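Read together, the reflowed `__init__` and `crawl` signatures map directly onto a call pattern. A minimal usage sketch based only on the parameters shown in the hunks above; the import path is the usual Haystack 1.x location, and the URL and directory are placeholders:

```python
from pathlib import Path
from typing import List

from haystack.nodes import Crawler  # import path assumed from Haystack 1.x

# Parameters mirror the __init__ signature in the hunk above.
crawler = Crawler(output_dir="crawled_files", crawler_depth=1)

# Crawl a single site and collect the JSON files that were written.
paths: List[Path] = crawler.crawl(
    urls=["https://haystack.deepset.ai"],  # placeholder URL
    filter_urls=["haystack"],              # keep only URLs containing this string
)
print(f"Stored {len(paths)} crawled pages")
```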
diff --git a/docs/_src/api/api/document_classifier.md b/docs/_src/api/api/document_classifier.md
index 629bbb568..acae37f8a 100644
--- a/docs/_src/api/api/document_classifier.md
+++ b/docs/_src/api/api/document_classifier.md
@@ -84,7 +84,19 @@ With this document_classifier, you can directly get predictions via predict()

#### TransformersDocumentClassifier.\_\_init\_\_

```python
-def __init__(model_name_or_path: str = "bhadresh-savani/distilbert-base-uncased-emotion", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, return_all_scores: bool = False, task: str = "text-classification", labels: Optional[List[str]] = None, batch_size: int = 16, classification_field: str = None, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
+def __init__(model_name_or_path:
+             str = "bhadresh-savani/distilbert-base-uncased-emotion",
+             model_version: Optional[str] = None,
+             tokenizer: Optional[str] = None,
+             use_gpu: bool = True,
+             return_all_scores: bool = False,
+             task: str = "text-classification",
+             labels: Optional[List[str]] = None,
+             batch_size: int = 16,
+             classification_field: str = None,
+             progress_bar: bool = True,
+             use_auth_token: Optional[Union[str, bool]] = None,
+             devices: Optional[List[Union[str, torch.device]]] = None)
```

Load a text classification model from Transformers.

@@ -132,7 +144,8 @@ parameter is not used and a single cpu device is used for inference.

#### TransformersDocumentClassifier.predict

```python
-def predict(documents: List[Document], batch_size: Optional[int] = None) -> List[Document]
+def predict(documents: List[Document],
+            batch_size: Optional[int] = None) -> List[Document]
```

Returns documents containing classification result in a meta field.

@@ -153,7 +166,10 @@ A list of Documents enriched with meta information.

#### TransformersDocumentClassifier.predict\_batch

```python
-def predict_batch(documents: Union[List[Document], List[List[Document]]], batch_size: Optional[int] = None) -> Union[List[Document], List[List[Document]]]
+def predict_batch(
+    documents: Union[List[Document], List[List[Document]]],
+    batch_size: Optional[int] = None
+) -> Union[List[Document], List[List[Document]]]
```

Returns documents containing classification result in meta field.

diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md
index 53aa056f7..aec2bdc88 100644
--- a/docs/_src/api/api/document_store.md
+++ b/docs/_src/api/api/document_store.md
@@ -28,7 +28,11 @@ Base class for implementing Document Stores.

```python
@abstractmethod
-def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
+def write_documents(documents: Union[List[dict], List[Document]],
+                    index: Optional[str] = None,
+                    batch_size: int = 10_000,
+                    duplicate_documents: Optional[str] = None,
+                    headers: Optional[Dict[str, str]] = None)
```

Indexes documents for later queries.
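The classifier signatures above translate into a short, pipeline-free usage example. A sketch assuming the default emotion model listed in the signature; the document text is a placeholder, and the exact meta key follows the docstring's "classification result in a meta field" description:

```python
from haystack.nodes import TransformersDocumentClassifier
from haystack.schema import Document

# Defaults come straight from the __init__ signature above.
classifier = TransformersDocumentClassifier(
    model_name_or_path="bhadresh-savani/distilbert-base-uncased-emotion",
    batch_size=16,
)

docs = [Document(content="This framework makes building search pipelines a joy.")]
classified = classifier.predict(documents=docs)

# The prediction lands in each document's meta field ("classification" is assumed).
print(classified[0].meta.get("classification"))
```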
@@ -61,7 +65,13 @@ None

```python
@abstractmethod
-def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def get_all_documents(
+        index: Optional[str] = None,
+        filters: Optional[Dict[str, Union[Dict, List, str, int, float,
+                                          bool]]] = None,
+        return_embedding: Optional[bool] = None,
+        batch_size: int = 10_000,
+        headers: Optional[Dict[str, str]] = None) -> List[Document]
```

Get documents from the document store.

@@ -106,7 +116,14 @@ operation.

```python
@abstractmethod
-def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+def get_all_documents_generator(
+        index: Optional[str] = None,
+        filters: Optional[Dict[str, Union[Dict, List, str, int, float,
+                                          bool]]] = None,
+        return_embedding: Optional[bool] = None,
+        batch_size: int = 10_000,
+        headers: Optional[Dict[str,
+                               str]] = None) -> Generator[Document, None, None]
```

Get documents from the document store. Under-the-hood, documents are fetched in batches from the

@@ -153,7 +170,15 @@ filters = {

#### BaseDocumentStore.get\_all\_labels\_aggregated

```python
-def get_all_labels_aggregated(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, open_domain: bool = True, drop_negative_labels: bool = False, drop_no_answers: bool = False, aggregate_by_meta: Optional[Union[str, list]] = None, headers: Optional[Dict[str, str]] = None) -> List[MultiLabel]
+def get_all_labels_aggregated(
+        index: Optional[str] = None,
+        filters: Optional[Dict[str, Union[Dict, List, str, int, float,
+                                          bool]]] = None,
+        open_domain: bool = True,
+        drop_negative_labels: bool = False,
+        drop_no_answers: bool = False,
+        aggregate_by_meta: Optional[Union[str, list]] = None,
+        headers: Optional[Dict[str, str]] = None) -> List[MultiLabel]
```

Return all labels in the DocumentStore, aggregated into MultiLabel objects.

@@ -223,7 +248,14 @@ Performs L2 normalization of embeddings vector inplace. Input can be a single ve

#### BaseDocumentStore.add\_eval\_data

```python
-def add_eval_data(filename: str, doc_index: str = "eval_document", label_index: str = "label", batch_size: Optional[int] = None, preprocessor: Optional[PreProcessor] = None, max_docs: Union[int, bool] = None, open_domain: bool = False, headers: Optional[Dict[str, str]] = None)
+def add_eval_data(filename: str,
+                  doc_index: str = "eval_document",
+                  label_index: str = "label",
+                  batch_size: Optional[int] = None,
+                  preprocessor: Optional[PreProcessor] = None,
+                  max_docs: Union[int, bool] = None,
+                  open_domain: bool = False,
+                  headers: Optional[Dict[str, str]] = None)
```

Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it.
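The `filters` parameter that recurs throughout these signatures takes a dictionary of metadata constraints. A sketch of that dictionary shape, using the `{"name": [...], "category": [...]}` convention quoted later in this file; field names and values are placeholders:

```python
from haystack.document_stores import InMemoryDocumentStore

document_store = InMemoryDocumentStore()

# Each key is a metadata field; each value lists the accepted values.
filters = {
    "name": ["some", "more"],   # meta["name"] must be one of these values
    "category": ["only_one"],   # and meta["category"] must equal "only_one"
}

docs = document_store.get_all_documents(filters=filters, return_embedding=False)
```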
@@ -272,7 +304,10 @@ None

#### BaseDocumentStore.run

```python
-def run(documents: List[Union[dict, Document]], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, id_hash_keys: Optional[List[str]] = None)
+def run(documents: List[Union[dict, Document]],
+        index: Optional[str] = None,
+        headers: Optional[Dict[str, str]] = None,
+        id_hash_keys: Optional[List[str]] = None)
```

Run requests of document stores

@@ -315,7 +350,15 @@ Base class for implementing Document Stores that support keyword searches.

```python
@abstractmethod
-def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False, scale_score: bool = True) -> List[Document]
+def query(query: Optional[str],
+          filters: Optional[Dict[str, Union[Dict, List, str, int, float,
+                                            bool]]] = None,
+          top_k: int = 10,
+          custom_query: Optional[str] = None,
+          index: Optional[str] = None,
+          headers: Optional[Dict[str, str]] = None,
+          all_terms_must_match: bool = False,
+          scale_score: bool = True) -> List[Document]
```

Scan through documents in DocumentStore and return a small number of documents

@@ -406,12 +449,19 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.

```python
@abstractmethod
-def query_batch(queries: List[str], filters: Optional[
-    Union[
-        Dict[str, Union[Dict, List, str, int, float, bool]],
-        List[Dict[str, Union[Dict, List, str, int, float, bool]]],
-    ]
-] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False, scale_score: bool = True) -> List[List[Document]]
+def query_batch(queries: List[str],
+                filters: Optional[Union[Dict[str, Union[Dict, List, str, int,
+                                                        float, bool]],
+                                        List[Dict[str,
+                                                  Union[Dict, List, str, int,
+                                                        float,
+                                                        bool]]], ]] = None,
+                top_k: int = 10,
+                custom_query: Optional[str] = None,
+                index: Optional[str] = None,
+                headers: Optional[Dict[str, str]] = None,
+                all_terms_must_match: bool = False,
+                scale_score: bool = True) -> List[List[Document]]
```

Scan through documents in DocumentStore and return a small number of documents

@@ -539,7 +589,10 @@ Base class implementing the common logic for Elasticsearch and Opensearch

#### BaseElasticsearchDocumentStore.get\_document\_by\_id

```python
-def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document]
+def get_document_by_id(
+        id: str,
+        index: Optional[str] = None,
+        headers: Optional[Dict[str, str]] = None) -> Optional[Document]
```

Fetch a document by specifying its text id string

@@ -549,7 +602,11 @@ Fetch a document by specifying its text id string

#### BaseElasticsearchDocumentStore.get\_documents\_by\_id

```python
-def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def get_documents_by_id(
+        ids: List[str],
+        index: Optional[str] = None,
+        batch_size: int = 10_000,
+        headers: Optional[Dict[str, str]] = None) -> List[Document]
```

Fetch documents by specifying a list of text id strings. Be aware that passing a large number of ids might lead
to performance issues.
Note that Elasticsearch limits the number of results to 10,000 documents by default.

#### BaseElasticsearchDocumentStore.get\_metadata\_values\_by\_key

```python
-def get_metadata_values_by_key(key: str, query: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[dict]
+def get_metadata_values_by_key(
+        key: str,
+        query: Optional[str] = None,
+        filters: Optional[Dict[str, Union[Dict, List, str, int, float,
+                                          bool]]] = None,
+        index: Optional[str] = None,
+        headers: Optional[Dict[str, str]] = None) -> List[dict]
```

Get values associated with a metadata key. The output is in the format:

@@ -606,7 +669,11 @@ Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-c

#### BaseElasticsearchDocumentStore.write\_documents

```python
-def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
+def write_documents(documents: Union[List[dict], List[Document]],
+                    index: Optional[str] = None,
+                    batch_size: int = 10_000,
+                    duplicate_documents: Optional[str] = None,
+                    headers: Optional[Dict[str, str]] = None)
```

Indexes documents for later queries in Elasticsearch.

@@ -651,7 +718,10 @@ None

#### BaseElasticsearchDocumentStore.write\_labels

```python
-def write_labels(labels: Union[List[Label], List[dict]], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, batch_size: int = 10_000)
+def write_labels(labels: Union[List[Label], List[dict]],
+                 index: Optional[str] = None,
+                 headers: Optional[Dict[str, str]] = None,
+                 batch_size: int = 10_000)
```

Write annotation labels into document store.

@@ -669,7 +739,10 @@ Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-c

#### BaseElasticsearchDocumentStore.update\_document\_meta

```python
-def update_document_meta(id: str, meta: Dict[str, str], index: str = None, headers: Optional[Dict[str, str]] = None)
+def update_document_meta(id: str,
+                         meta: Dict[str, str],
+                         index: str = None,
+                         headers: Optional[Dict[str, str]] = None)
```

Update the metadata dictionary of a document by specifying its string id

@@ -679,7 +752,11 @@ Update the metadata dictionary of a document by specifying its string id

#### BaseElasticsearchDocumentStore.get\_document\_count

```python
-def get_document_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int
+def get_document_count(filters: Optional[Dict[str, Union[Dict, List, str, int,
+                                                         float, bool]]] = None,
+                       index: Optional[str] = None,
+                       only_documents_without_embedding: bool = False,
+                       headers: Optional[Dict[str, str]] = None) -> int
```

Return the number of documents in the document store.
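With the `query` signature reflowed above, the keyword-search call pattern is easier to see. A sketch against an Elasticsearch-backed store; the connection settings match the CI containers earlier in this diff, and the query string is a placeholder:

```python
from haystack.document_stores import ElasticsearchDocumentStore

document_store = ElasticsearchDocumentStore(host="localhost", port=9200)

# BM25 keyword search, mirroring the query() signature above.
results = document_store.query(
    query="who invented the transformer architecture?",  # placeholder query
    top_k=10,
    all_terms_must_match=False,  # OR semantics across query terms
    scale_score=True,            # scale similarity scores to [0, 1]
)
for doc in results:
    print(doc.score, doc.content[:80])
```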
@@ -689,7 +766,8 @@

#### BaseElasticsearchDocumentStore.get\_label\_count

```python
-def get_label_count(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> int
+def get_label_count(index: Optional[str] = None,
+                    headers: Optional[Dict[str, str]] = None) -> int
```

Return the number of labels in the document store

@@ -699,7 +777,11 @@ Return the number of labels in the document store

#### BaseElasticsearchDocumentStore.get\_embedding\_count

```python
-def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) -> int
+def get_embedding_count(index: Optional[str] = None,
+                        filters: Optional[Dict[str,
+                                               Union[Dict, List, str, int,
+                                                     float, bool]]] = None,
+                        headers: Optional[Dict[str, str]] = None) -> int
```

Return the count of embeddings in the document store.

@@ -709,7 +791,13 @@ Return the count of embeddings in the document store.

#### BaseElasticsearchDocumentStore.get\_all\_documents

```python
-def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def get_all_documents(
+        index: Optional[str] = None,
+        filters: Optional[Dict[str, Union[Dict, List, str, int, float,
+                                          bool]]] = None,
+        return_embedding: Optional[bool] = None,
+        batch_size: int = 10_000,
+        headers: Optional[Dict[str, str]] = None) -> List[Document]
```

Get documents from the document store.

@@ -753,7 +841,14 @@ Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-c

#### BaseElasticsearchDocumentStore.get\_all\_documents\_generator

```python
-def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+def get_all_documents_generator(
+        index: Optional[str] = None,
+        filters: Optional[Dict[str, Union[Dict, List, str, int, float,
+                                          bool]]] = None,
+        return_embedding: Optional[bool] = None,
+        batch_size: int = 10_000,
+        headers: Optional[Dict[str,
+                               str]] = None) -> Generator[Document, None, None]
```

Get documents from the document store.
Under-the-hood, documents are fetched in batches from the

@@ -800,7 +895,11 @@ Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-c

#### BaseElasticsearchDocumentStore.get\_all\_labels

```python
-def get_all_labels(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, batch_size: int = 10_000) -> List[Label]
+def get_all_labels(index: Optional[str] = None,
+                   filters: Optional[Dict[str, Union[Dict, List, str, int,
+                                                     float, bool]]] = None,
+                   headers: Optional[Dict[str, str]] = None,
+                   batch_size: int = 10_000) -> List[Label]
```

Return all labels in the document store

@@ -810,7 +909,15 @@ Return all labels in the document store

#### BaseElasticsearchDocumentStore.query

```python
-def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False, scale_score: bool = True) -> List[Document]
+def query(query: Optional[str],
+          filters: Optional[Dict[str, Union[Dict, List, str, int, float,
+                                            bool]]] = None,
+          top_k: int = 10,
+          custom_query: Optional[str] = None,
+          index: Optional[str] = None,
+          headers: Optional[Dict[str, str]] = None,
+          all_terms_must_match: bool = False,
+          scale_score: bool = True) -> List[Document]
```

Scan through documents in DocumentStore and return a small number of documents

@@ -964,12 +1071,19 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.

#### BaseElasticsearchDocumentStore.query\_batch

```python
-def query_batch(queries: List[str], filters: Optional[
-    Union[
-        Dict[str, Union[Dict, List, str, int, float, bool]],
-        List[Dict[str, Union[Dict, List, str, int, float, bool]]],
-    ]
-] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False, scale_score: bool = True) -> List[List[Document]]
+def query_batch(queries: List[str],
+                filters: Optional[Union[Dict[str, Union[Dict, List, str, int,
+                                                        float, bool]],
+                                        List[Dict[str,
+                                                  Union[Dict, List, str, int,
+                                                        float,
+                                                        bool]]], ]] = None,
+                top_k: int = 10,
+                custom_query: Optional[str] = None,
+                index: Optional[str] = None,
+                headers: Optional[Dict[str, str]] = None,
+                all_terms_must_match: bool = False,
+                scale_score: bool = True) -> List[List[Document]]
```

Scan through documents in DocumentStore and return a small number of documents

@@ -1063,7 +1177,14 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.

#### BaseElasticsearchDocumentStore.query\_by\_embedding

```python
-def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document]
+def query_by_embedding(query_emb: np.ndarray,
+                       filters: Optional[Dict[str, Union[Dict, List, str, int,
+                                                         float, bool]]] = None,
+                       top_k: int = 10,
+                       index: Optional[str] = None,
+                       return_embedding: Optional[bool] = None,
+                       headers: Optional[Dict[str, str]] = None,
+                       scale_score: bool = True) -> List[Document]
```

Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.

@@ -1148,7 +1269,13 @@ Otherwise raw similarity scores (e.g.
cosine or dot_product) will be used.

#### BaseElasticsearchDocumentStore.update\_embeddings

```python
-def update_embeddings(retriever, index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None)
+def update_embeddings(retriever,
+                      index: Optional[str] = None,
+                      filters: Optional[Dict[str, Union[Dict, List, str, int,
+                                                        float, bool]]] = None,
+                      update_existing_embeddings: bool = True,
+                      batch_size: int = 10_000,
+                      headers: Optional[Dict[str, str]] = None)
```

Updates the embeddings in the document store using the encoding model specified in the retriever.

@@ -1201,7 +1328,11 @@ None

#### BaseElasticsearchDocumentStore.delete\_all\_documents

```python
-def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_all_documents(index: Optional[str] = None,
+                         filters: Optional[Dict[str,
+                                                Union[Dict, List, str, int,
+                                                      float, bool]]] = None,
+                         headers: Optional[Dict[str, str]] = None)
```

Delete documents in an index. All documents are deleted if no filters are passed.

@@ -1246,7 +1377,11 @@ None

#### BaseElasticsearchDocumentStore.delete\_documents

```python
-def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_documents(index: Optional[str] = None,
+                     ids: Optional[List[str]] = None,
+                     filters: Optional[Dict[str, Union[Dict, List, str, int,
+                                                       float, bool]]] = None,
+                     headers: Optional[Dict[str, str]] = None)
```

Delete documents in an index. All documents are deleted if no filters are passed.

@@ -1297,7 +1432,11 @@ None

#### BaseElasticsearchDocumentStore.delete\_labels

```python
-def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_labels(index: Optional[str] = None,
+                  ids: Optional[List[str]] = None,
+                  filters: Optional[Dict[str, Union[Dict, List, str, int,
+                                                    float, bool]]] = None,
+                  headers: Optional[Dict[str, str]] = None)
```

Delete labels in an index. All labels are deleted if no filters are passed.
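The three delete methods above share the same `ids`/`filters` interface. A sketch showing the filter-based variant; index name and metadata values are placeholders:

```python
from haystack.document_stores import ElasticsearchDocumentStore

document_store = ElasticsearchDocumentStore()

# Delete only documents whose meta["category"] matches; with neither
# ids nor filters, delete_documents() would clear the whole index.
document_store.delete_documents(
    index="document",
    filters={"category": ["outdated"]},  # placeholder filter
)
```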
@@ -1370,7 +1509,39 @@ class ElasticsearchDocumentStore(BaseElasticsearchDocumentStore)

#### ElasticsearchDocumentStore.\_\_init\_\_

```python
-def __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", similarity: str = "dot_product", timeout: int = 30, return_embedding: bool = False, duplicate_documents: str = "overwrite", index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym", use_system_proxy: bool = False)
+def __init__(host: Union[str, List[str]] = "localhost",
+             port: Union[int, List[int]] = 9200,
+             username: str = "",
+             password: str = "",
+             api_key_id: Optional[str] = None,
+             api_key: Optional[str] = None,
+             aws4auth=None,
+             index: str = "document",
+             label_index: str = "label",
+             search_fields: Union[str, list] = "content",
+             content_field: str = "content",
+             name_field: str = "name",
+             embedding_field: str = "embedding",
+             embedding_dim: int = 768,
+             custom_mapping: Optional[dict] = None,
+             excluded_meta_data: Optional[list] = None,
+             analyzer: str = "standard",
+             scheme: str = "http",
+             ca_certs: Optional[str] = None,
+             verify_certs: bool = True,
+             recreate_index: bool = False,
+             create_index: bool = True,
+             refresh_type: str = "wait_for",
+             similarity: str = "dot_product",
+             timeout: int = 30,
+             return_embedding: bool = False,
+             duplicate_documents: str = "overwrite",
+             index_type: str = "flat",
+             scroll: str = "1d",
+             skip_missing_embeddings: bool = True,
+             synonyms: Optional[List] = None,
+             synonym_type: str = "synonym",
+             use_system_proxy: bool = False)
```

A DocumentStore using Elasticsearch to store and query the documents for our search.
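The reflowed constructor makes the connection-related parameters easy to pick out. A minimal sketch wiring the ones most commonly changed; every value below mirrors a default from the signature above, and the empty credentials are placeholders for an unsecured dev cluster:

```python
from haystack.document_stores import ElasticsearchDocumentStore

document_store = ElasticsearchDocumentStore(
    host="localhost",
    port=9200,
    username="",            # placeholder; set real credentials for secured clusters
    password="",
    index="document",
    embedding_dim=768,
    similarity="dot_product",
    duplicate_documents="overwrite",
)
```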
@@ -1463,7 +1634,40 @@ class OpenSearchDocumentStore(BaseElasticsearchDocumentStore)

#### OpenSearchDocumentStore.\_\_init\_\_

```python
-def __init__(scheme: str = "https", username: str = "admin", password: str = "admin", host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", ca_certs: Optional[str] = None, verify_certs: bool = False, recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", similarity: str = "dot_product", timeout: int = 30, return_embedding: bool = False, duplicate_documents: str = "overwrite", index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym", use_system_proxy: bool = False, knn_engine: str = "nmslib")
+def __init__(scheme: str = "https",
+             username: str = "admin",
+             password: str = "admin",
+             host: Union[str, List[str]] = "localhost",
+             port: Union[int, List[int]] = 9200,
+             api_key_id: Optional[str] = None,
+             api_key: Optional[str] = None,
+             aws4auth=None,
+             index: str = "document",
+             label_index: str = "label",
+             search_fields: Union[str, list] = "content",
+             content_field: str = "content",
+             name_field: str = "name",
+             embedding_field: str = "embedding",
+             embedding_dim: int = 768,
+             custom_mapping: Optional[dict] = None,
+             excluded_meta_data: Optional[list] = None,
+             analyzer: str = "standard",
+             ca_certs: Optional[str] = None,
+             verify_certs: bool = False,
+             recreate_index: bool = False,
+             create_index: bool = True,
+             refresh_type: str = "wait_for",
+             similarity: str = "dot_product",
+             timeout: int = 30,
+             return_embedding: bool = False,
+             duplicate_documents: str = "overwrite",
+             index_type: str = "flat",
+             scroll: str = "1d",
+             skip_missing_embeddings: bool = True,
+             synonyms: Optional[List] = None,
+             synonym_type: str = "synonym",
+             use_system_proxy: bool = False,
+             knn_engine: str = "nmslib")
```

Document Store using OpenSearch (https://opensearch.org/). It is compatible with the AWS Elasticsearch Service.

@@ -1543,7 +1747,14 @@ For more information, see [k-NN Index](https://opensearch.org/docs/latest/search

#### OpenSearchDocumentStore.query\_by\_embedding

```python
-def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document]
+def query_by_embedding(query_emb: np.ndarray,
+                       filters: Optional[Dict[str, Union[Dict, List, str, int,
+                                                         float, bool]]] = None,
+                       top_k: int = 10,
+                       index: Optional[str] = None,
+                       return_embedding: Optional[bool] = None,
+                       headers: Optional[Dict[str, str]] = None,
+                       scale_score: bool = True) -> List[Document]
```

Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
@@ -1652,7 +1863,17 @@ In-memory document store

#### InMemoryDocumentStore.\_\_init\_\_

```python
-def __init__(index: str = "document", label_index: str = "label", embedding_field: Optional[str] = "embedding", embedding_dim: int = 768, return_embedding: bool = False, similarity: str = "dot_product", progress_bar: bool = True, duplicate_documents: str = "overwrite", use_gpu: bool = True, scoring_batch_size: int = 500000, devices: Optional[List[Union[str, torch.device]]] = None)
+def __init__(index: str = "document",
+             label_index: str = "label",
+             embedding_field: Optional[str] = "embedding",
+             embedding_dim: int = 768,
+             return_embedding: bool = False,
+             similarity: str = "dot_product",
+             progress_bar: bool = True,
+             duplicate_documents: str = "overwrite",
+             use_gpu: bool = True,
+             scoring_batch_size: int = 500000,
+             devices: Optional[List[Union[str, torch.device]]] = None)
```

**Arguments**:

@@ -1690,7 +1911,11 @@ parameter is not used and a single cpu device is used for inference.

#### InMemoryDocumentStore.write\_documents

```python
-def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
+def write_documents(documents: Union[List[dict], List[Document]],
+                    index: Optional[str] = None,
+                    batch_size: int = 10_000,
+                    duplicate_documents: Optional[str] = None,
+                    headers: Optional[Dict[str, str]] = None)
```

Indexes documents for later queries.

@@ -1718,7 +1943,9 @@ For documents as dictionaries, the format is {"content": ""}.

#### InMemoryDocumentStore.write\_labels

```python
-def write_labels(labels: Union[List[dict], List[Label]], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
+def write_labels(labels: Union[List[dict], List[Label]],
+                 index: Optional[str] = None,
+                 headers: Optional[Dict[str, str]] = None)
```

Write annotation labels into document store.

@@ -1728,7 +1955,10 @@ Write annotation labels into document store.

#### InMemoryDocumentStore.get\_document\_by\_id

```python
-def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document]
+def get_document_by_id(
+        id: str,
+        index: Optional[str] = None,
+        headers: Optional[Dict[str, str]] = None) -> Optional[Document]
```

Fetch a document by specifying its text id string.

@@ -1738,7 +1968,8 @@ Fetch a document by specifying its text id string.

#### InMemoryDocumentStore.get\_documents\_by\_id

```python
-def get_documents_by_id(ids: List[str], index: Optional[str] = None) -> List[Document]
+def get_documents_by_id(ids: List[str],
+                        index: Optional[str] = None) -> List[Document]
```

Fetch documents by specifying a list of text id strings.

@@ -1748,7 +1979,8 @@ Fetch documents by specifying a list of text id strings.

#### InMemoryDocumentStore.get\_scores\_torch

```python
-def get_scores_torch(query_emb: np.ndarray, document_to_search: List[Document]) -> List[float]
+def get_scores_torch(query_emb: np.ndarray,
+                     document_to_search: List[Document]) -> List[float]
```

Calculate similarity scores between query embedding and a list of documents using torch.
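Because the in-memory store needs no external service, the signatures above can be exercised directly. A minimal sketch with placeholder content, using only parameters and methods shown in this file:

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.schema import Document

document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product")

# write_documents() accepts plain dicts or Document objects, per the signature above.
document_store.write_documents([
    {"content": "Haystack pipelines combine retrievers and readers."},
    Document(content="Document stores hold the corpus being searched."),
])

doc_ids = [d.id for d in document_store.get_all_documents()]
first = document_store.get_document_by_id(doc_ids[0])
print(first.content)
```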
@@ -1763,7 +1995,8 @@ Calculate similarity scores between query embedding and a list of documents usin

#### InMemoryDocumentStore.get\_scores\_numpy

```python
-def get_scores_numpy(query_emb: np.ndarray, document_to_search: List[Document]) -> List[float]
+def get_scores_numpy(query_emb: np.ndarray,
+                     document_to_search: List[Document]) -> List[float]
```

Calculate similarity scores between query embedding and a list of documents using numpy.

@@ -1778,7 +2011,13 @@ Calculate similarity scores between query embedding and a list of documents usin

#### InMemoryDocumentStore.query\_by\_embedding

```python
-def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document]
+def query_by_embedding(query_emb: np.ndarray,
+                       filters: Optional[Dict[str, Any]] = None,
+                       top_k: int = 10,
+                       index: Optional[str] = None,
+                       return_embedding: Optional[bool] = None,
+                       headers: Optional[Dict[str, str]] = None,
+                       scale_score: bool = True) -> List[Document]
```

Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.

@@ -1858,7 +2097,11 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.

#### InMemoryDocumentStore.update\_embeddings

```python
-def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000)
+def update_embeddings(retriever: "BaseRetriever",
+                      index: Optional[str] = None,
+                      filters: Optional[Dict[str, Any]] = None,
+                      update_existing_embeddings: bool = True,
+                      batch_size: int = 10_000)
```

Updates the embeddings in the document store using the encoding model specified in the retriever.

@@ -1908,7 +2151,10 @@ None

#### InMemoryDocumentStore.get\_document\_count

```python
-def get_document_count(filters: Optional[Dict[str, Any]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int
+def get_document_count(filters: Optional[Dict[str, Any]] = None,
+                       index: Optional[str] = None,
+                       only_documents_without_embedding: bool = False,
+                       headers: Optional[Dict[str, str]] = None) -> int
```

Return the number of documents in the document store.

@@ -1934,7 +2180,8 @@ Update the metadata dictionary of a document by specifying its string id.

#### InMemoryDocumentStore.get\_embedding\_count

```python
-def get_embedding_count(filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int
+def get_embedding_count(filters: Optional[Dict[str, List[str]]] = None,
+                        index: Optional[str] = None) -> int
```

Return the count of embeddings in the document store.

@@ -1944,7 +2191,8 @@

#### InMemoryDocumentStore.get\_label\_count

```python
-def get_label_count(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> int
+def get_label_count(index: Optional[str] = None,
+                    headers: Optional[Dict[str, str]] = None) -> int
```

Return the number of labels in the document store.
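`update_embeddings` is the bridge between a store and a retriever's encoder. A sketch pairing the in-memory store with an `EmbeddingRetriever`; the model name is a placeholder, and the retriever class is the standard Haystack 1.x node:

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import EmbeddingRetriever

document_store = InMemoryDocumentStore(embedding_dim=768)
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",  # placeholder model
)

# Encode every stored document with the retriever's model, per the signature above.
document_store.update_embeddings(retriever, update_existing_embeddings=True)
```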
@@ -1954,7 +2202,12 @@

#### InMemoryDocumentStore.get\_all\_documents

```python
-def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def get_all_documents(
+        index: Optional[str] = None,
+        filters: Optional[Dict[str, Any]] = None,
+        return_embedding: Optional[bool] = None,
+        batch_size: int = 10_000,
+        headers: Optional[Dict[str, str]] = None) -> List[Document]
```

Get all documents from the document store as a list.

@@ -1994,7 +2247,13 @@ Example:

#### InMemoryDocumentStore.get\_all\_documents\_generator

```python
-def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+def get_all_documents_generator(
+        index: Optional[str] = None,
+        filters: Optional[Dict[str, Any]] = None,
+        return_embedding: Optional[bool] = None,
+        batch_size: int = 10_000,
+        headers: Optional[Dict[str,
+                               str]] = None) -> Generator[Document, None, None]
```

Get all documents from the document store. The method returns a Python Generator that yields individual

@@ -2036,7 +2295,9 @@ Example:

#### InMemoryDocumentStore.get\_all\_labels

```python
-def get_all_labels(index: str = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) -> List[Label]
+def get_all_labels(index: str = None,
+                   filters: Optional[Dict[str, Any]] = None,
+                   headers: Optional[Dict[str, str]] = None) -> List[Label]
```

Return all labels in the document store.

@@ -2046,7 +2307,9 @@ Return all labels in the document store.

#### InMemoryDocumentStore.delete\_all\_documents

```python
-def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_all_documents(index: Optional[str] = None,
+                         filters: Optional[Dict[str, Any]] = None,
+                         headers: Optional[Dict[str, str]] = None)
```

Delete documents in an index. All documents are deleted if no filters are passed.

@@ -2088,7 +2351,10 @@ None

#### InMemoryDocumentStore.delete\_documents

```python
-def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_documents(index: Optional[str] = None,
+                     ids: Optional[List[str]] = None,
+                     filters: Optional[Dict[str, Any]] = None,
+                     headers: Optional[Dict[str, str]] = None)
```

Delete documents in an index. All documents are deleted if no filters are passed.

@@ -2150,7 +2416,10 @@ None

#### InMemoryDocumentStore.delete\_labels

```python
-def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_labels(index: Optional[str] = None,
+                  ids: Optional[List[str]] = None,
+                  filters: Optional[Dict[str, Any]] = None,
+                  headers: Optional[Dict[str, str]] = None)
```

Delete labels in an index. All labels are deleted if no filters are passed.
@@ -2206,7 +2475,12 @@ class SQLDocumentStore(BaseDocumentStore)

#### SQLDocumentStore.\_\_init\_\_

```python
-def __init__(url: str = "sqlite://", index: str = "document", label_index: str = "label", duplicate_documents: str = "overwrite", check_same_thread: bool = False, isolation_level: str = None)
+def __init__(url: str = "sqlite://",
+             index: str = "document",
+             label_index: str = "label",
+             duplicate_documents: str = "overwrite",
+             check_same_thread: bool = False,
+             isolation_level: str = None)
```

An SQL backed DocumentStore. Currently supports SQLite, PostgreSQL and MySQL backends.

@@ -2231,7 +2505,10 @@ exists.

#### SQLDocumentStore.get\_document\_by\_id

```python
-def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document]
+def get_document_by_id(
+        id: str,
+        index: Optional[str] = None,
+        headers: Optional[Dict[str, str]] = None) -> Optional[Document]
```

Fetch a document by specifying its text id string

@@ -2241,7 +2518,11 @@ Fetch a document by specifying its text id string

#### SQLDocumentStore.get\_documents\_by\_id

```python
-def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def get_documents_by_id(
+        ids: List[str],
+        index: Optional[str] = None,
+        batch_size: int = 10_000,
+        headers: Optional[Dict[str, str]] = None) -> List[Document]
```

Fetch documents by specifying a list of text id strings

@@ -2251,7 +2532,9 @@ Fetch documents by specifying a list of text id strings

#### SQLDocumentStore.get\_documents\_by\_vector\_ids

```python
-def get_documents_by_vector_ids(vector_ids: List[str], index: Optional[str] = None, batch_size: int = 10_000)
+def get_documents_by_vector_ids(vector_ids: List[str],
+                                index: Optional[str] = None,
+                                batch_size: int = 10_000)
```

Fetch documents by specifying a list of text vector id strings

@@ -2261,7 +2544,13 @@ Fetch documents by specifying a list of text vector id strings

#### SQLDocumentStore.get\_all\_documents\_generator

```python
-def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+def get_all_documents_generator(
+        index: Optional[str] = None,
+        filters: Optional[Dict[str, Any]] = None,
+        return_embedding: Optional[bool] = None,
+        batch_size: int = 10_000,
+        headers: Optional[Dict[str,
+                               str]] = None) -> Generator[Document, None, None]
```

Get documents from the document store.
Under-the-hood, documents are fetched in batches from the

@@ -2283,7 +2572,9 @@ Example: {"name": ["some", "more"], "category": ["only_one"]}

#### SQLDocumentStore.get\_all\_labels

```python
-def get_all_labels(index=None, filters: Optional[dict] = None, headers: Optional[Dict[str, str]] = None)
+def get_all_labels(index=None,
+                   filters: Optional[dict] = None,
+                   headers: Optional[Dict[str, str]] = None)
```

Return all labels in the document store

@@ -2293,7 +2584,11 @@ Return all labels in the document store

#### SQLDocumentStore.write\_documents

```python
-def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> None
+def write_documents(documents: Union[List[dict], List[Document]],
+                    index: Optional[str] = None,
+                    batch_size: int = 10_000,
+                    duplicate_documents: Optional[str] = None,
+                    headers: Optional[Dict[str, str]] = None) -> None
```

Indexes documents for later queries.

@@ -2335,7 +2630,9 @@ Write annotation labels into document store.

#### SQLDocumentStore.update\_vector\_ids

```python
-def update_vector_ids(vector_id_map: Dict[str, str], index: Optional[str] = None, batch_size: int = 10_000)
+def update_vector_ids(vector_id_map: Dict[str, str],
+                      index: Optional[str] = None,
+                      batch_size: int = 10_000)
```

Update vector_ids for given document_ids.

@@ -2371,7 +2668,10 @@ Update the metadata dictionary of a document by specifying its string id

#### SQLDocumentStore.get\_document\_count

```python
-def get_document_count(filters: Optional[Dict[str, Any]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int
+def get_document_count(filters: Optional[Dict[str, Any]] = None,
+                       index: Optional[str] = None,
+                       only_documents_without_embedding: bool = False,
+                       headers: Optional[Dict[str, str]] = None) -> int
```

Return the number of documents in the document store.

@@ -2381,7 +2681,8 @@ Return the number of documents in the document store.

#### SQLDocumentStore.get\_label\_count

```python
-def get_label_count(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> int
+def get_label_count(index: Optional[str] = None,
+                    headers: Optional[Dict[str, str]] = None) -> int
```

Return the number of labels in the document store

@@ -2391,7 +2692,9 @@ Return the number of labels in the document store

#### SQLDocumentStore.delete\_all\_documents

```python
-def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_all_documents(index: Optional[str] = None,
+                         filters: Optional[Dict[str, Any]] = None,
+                         headers: Optional[Dict[str, str]] = None)
```

Delete documents in an index. All documents are deleted if no filters are passed.

@@ -2410,7 +2713,10 @@ None

#### SQLDocumentStore.delete\_documents

```python
-def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_documents(index: Optional[str] = None,
+                     ids: Optional[List[str]] = None,
+                     filters: Optional[Dict[str, Any]] = None,
+                     headers: Optional[Dict[str, str]] = None)
```

Delete documents in an index. All documents are deleted if no filters are passed.
@@ -2453,7 +2759,10 @@ None

#### SQLDocumentStore.delete\_labels

```python
-def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_labels(index: Optional[str] = None,
+                  ids: Optional[List[str]] = None,
+                  filters: Optional[Dict[str, Any]] = None,
+                  headers: Optional[Dict[str, str]] = None)
```

Delete labels from the document store. All labels are deleted if no filters are passed.

@@ -2495,7 +2804,24 @@ the vector embeddings are indexed in a FAISS Index.

#### FAISSDocumentStore.\_\_init\_\_

```python
-def __init__(sql_url: str = "sqlite:///faiss_document_store.db", vector_dim: int = None, embedding_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", faiss_index_path: Union[str, Path] = None, faiss_config_path: Union[str, Path] = None, isolation_level: str = None, n_links: int = 64, ef_search: int = 20, ef_construction: int = 80, validate_index_sync: bool = True)
+def __init__(sql_url: str = "sqlite:///faiss_document_store.db",
+             vector_dim: int = None,
+             embedding_dim: int = 768,
+             faiss_index_factory_str: str = "Flat",
+             faiss_index: Optional[faiss.swigfaiss.Index] = None,
+             return_embedding: bool = False,
+             index: str = "document",
+             similarity: str = "dot_product",
+             embedding_field: str = "embedding",
+             progress_bar: bool = True,
+             duplicate_documents: str = "overwrite",
+             faiss_index_path: Union[str, Path] = None,
+             faiss_config_path: Union[str, Path] = None,
+             isolation_level: str = None,
+             n_links: int = 64,
+             ef_search: int = 20,
+             ef_construction: int = 80,
+             validate_index_sync: bool = True)
```

**Arguments**:

@@ -2552,7 +2878,11 @@ Can be created via calling `save()`

#### FAISSDocumentStore.write\_documents

```python
-def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> None
+def write_documents(documents: Union[List[dict], List[Document]],
+                    index: Optional[str] = None,
+                    batch_size: int = 10_000,
+                    duplicate_documents: Optional[str] = None,
+                    headers: Optional[Dict[str, str]] = None) -> None
```

Add new documents to the DocumentStore.

@@ -2583,7 +2913,11 @@ None

#### FAISSDocumentStore.update\_embeddings

```python
-def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Any]] = None, batch_size: int = 10_000)
+def update_embeddings(retriever: "BaseRetriever",
+                      index: Optional[str] = None,
+                      update_existing_embeddings: bool = True,
+                      filters: Optional[Dict[str, Any]] = None,
+                      batch_size: int = 10_000)
```

Updates the embeddings in the document store using the encoding model specified in the retriever.
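FAISS pairs a SQL store (for text and metadata) with a vector index (for embeddings), as the `sql_url` parameter in the constructor above suggests. A minimal sketch; the retriever model name is a placeholder, and all store parameters are the defaults from the signature:

```python
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever

# sql_url and faiss_index_factory_str mirror the defaults in the signature above.
document_store = FAISSDocumentStore(
    sql_url="sqlite:///faiss_document_store.db",
    faiss_index_factory_str="Flat",
    embedding_dim=768,
)

document_store.write_documents([{"content": "FAISS stores vectors; SQL stores text."}])

retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",  # placeholder
)
document_store.update_embeddings(retriever)
```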
@@ -2611,7 +2945,13 @@ None

#### FAISSDocumentStore.get\_all\_documents\_generator

```python
-def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+def get_all_documents_generator(
+        index: Optional[str] = None,
+        filters: Optional[Dict[str, Any]] = None,
+        return_embedding: Optional[bool] = None,
+        batch_size: int = 10_000,
+        headers: Optional[Dict[str,
+                               str]] = None) -> Generator[Document, None, None]
```

Get all documents from the document store. Under-the-hood, documents are fetched in batches from the

@@ -2633,7 +2973,8 @@ Example: {"name": ["some", "more"], "category": ["only_one"]}

#### FAISSDocumentStore.get\_embedding\_count

```python
-def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None) -> int
+def get_embedding_count(index: Optional[str] = None,
+                        filters: Optional[Dict[str, Any]] = None) -> int
```

Return the count of embeddings in the document store.

@@ -2643,7 +2984,9 @@ Return the count of embeddings in the document store.

#### FAISSDocumentStore.train\_index

```python
-def train_index(documents: Optional[Union[List[dict], List[Document]]], embeddings: Optional[np.ndarray] = None, index: Optional[str] = None)
+def train_index(documents: Optional[Union[List[dict], List[Document]]],
+                embeddings: Optional[np.ndarray] = None,
+                index: Optional[str] = None)
```

Some FAISS indices (e.g. IVF) require initial "training" on a sample of vectors before you can add your final vectors.

@@ -2666,7 +3009,9 @@ None

#### FAISSDocumentStore.delete\_all\_documents

```python
-def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_all_documents(index: Optional[str] = None,
+                         filters: Optional[Dict[str, Any]] = None,
+                         headers: Optional[Dict[str, str]] = None)
```

Delete all documents from the document store.

@@ -2676,7 +3021,10 @@ Delete all documents from the document store.

#### FAISSDocumentStore.delete\_documents

```python
-def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_documents(index: Optional[str] = None,
+                     ids: Optional[List[str]] = None,
+                     filters: Optional[Dict[str, Any]] = None,
+                     headers: Optional[Dict[str, str]] = None)
```

Delete documents from the document store. All documents are deleted if no filters are passed.

@@ -2719,7 +3067,13 @@ None

#### FAISSDocumentStore.query\_by\_embedding

```python
-def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document]
+def query_by_embedding(query_emb: np.ndarray,
+                       filters: Optional[Dict[str, Any]] = None,
+                       top_k: int = 10,
+                       index: Optional[str] = None,
+                       return_embedding: Optional[bool] = None,
+                       headers: Optional[Dict[str, str]] = None,
+                       scale_score: bool = True) -> List[Document]
```

Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
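As the `train_index` docstring above notes, IVF-style FAISS indices must be trained before vectors are added. A sketch of that flow; the sample embeddings are synthetic placeholders:

```python
import numpy as np
from haystack.document_stores import FAISSDocumentStore

# An IVF index needs training, unlike the default "Flat" factory string.
document_store = FAISSDocumentStore(
    faiss_index_factory_str="IVF256,Flat",
    embedding_dim=768,
)

# Train on a representative sample of vectors before writing documents.
sample_embeddings = np.random.rand(10_000, 768).astype("float32")  # placeholder sample
document_store.train_index(documents=None, embeddings=sample_embeddings)
```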
@@ -2741,7 +3095,8 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.

#### FAISSDocumentStore.save

```python
-def save(index_path: Union[str, Path], config_path: Optional[Union[str, Path]] = None)
+def save(index_path: Union[str, Path],
+         config_path: Optional[Union[str, Path]] = None)
```

Save FAISS Index to the specified file.

@@ -2765,7 +3120,9 @@ None

```python
@classmethod
-def load(cls, index_path: Union[str, Path], config_path: Optional[Union[str, Path]] = None)
+def load(cls,
+         index_path: Union[str, Path],
+         config_path: Optional[Union[str, Path]] = None)
```

Load a saved FAISS index from a file and connect to the SQL database.

@@ -2812,7 +3169,22 @@ Usage:

#### Milvus1DocumentStore.\_\_init\_\_

```python
-def __init__(sql_url: str = "sqlite:///", milvus_url: str = "tcp://localhost:19530", connection_pool: str = "SingletonThread", index: str = "document", vector_dim: int = None, embedding_dim: int = 768, index_file_size: int = 1024, similarity: str = "dot_product", index_type: IndexType = IndexType.FLAT, index_param: Optional[Dict[str, Any]] = None, search_param: Optional[Dict[str, Any]] = None, return_embedding: bool = False, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", isolation_level: str = None)
+def __init__(sql_url: str = "sqlite:///",
+             milvus_url: str = "tcp://localhost:19530",
+             connection_pool: str = "SingletonThread",
+             index: str = "document",
+             vector_dim: int = None,
+             embedding_dim: int = 768,
+             index_file_size: int = 1024,
+             similarity: str = "dot_product",
+             index_type: IndexType = IndexType.FLAT,
+             index_param: Optional[Dict[str, Any]] = None,
+             search_param: Optional[Dict[str, Any]] = None,
+             return_embedding: bool = False,
+             embedding_field: str = "embedding",
+             progress_bar: bool = True,
+             duplicate_documents: str = "overwrite",
+             isolation_level: str = None)
```

**WARNING:** Milvus1DocumentStore is deprecated and will be removed in a future version. Please switch to Milvus2

@@ -2869,7 +3241,12 @@ exists.

#### Milvus1DocumentStore.write\_documents

```python
-def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None, index_param: Optional[Dict[str, Any]] = None)
+def write_documents(documents: Union[List[dict], List[Document]],
+                    index: Optional[str] = None,
+                    batch_size: int = 10_000,
+                    duplicate_documents: Optional[str] = None,
+                    headers: Optional[Dict[str, str]] = None,
+                    index_param: Optional[Dict[str, Any]] = None)
```

Add new documents to the DocumentStore.

@@ -2900,7 +3277,11 @@ None

#### Milvus1DocumentStore.update\_embeddings

```python
-def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, batch_size: int = 10_000, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Any]] = None)
+def update_embeddings(retriever: "BaseRetriever",
+                      index: Optional[str] = None,
+                      batch_size: int = 10_000,
+                      update_existing_embeddings: bool = True,
+                      filters: Optional[Dict[str, Any]] = None)
```

Updates the embeddings in the document store using the encoding model specified in the retriever.
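The `save`/`load` pair above persists the FAISS index alongside its configuration. A round-trip sketch, assuming the default SQLite-backed store and a placeholder file name:

```python
from haystack.document_stores import FAISSDocumentStore

document_store = FAISSDocumentStore(embedding_dim=768)
# ... write documents and update embeddings here ...

# Persist the FAISS index; per the docstring, a config file accompanies it.
document_store.save(index_path="my_faiss_index.faiss")

# Later: restore the index and reconnect to the existing SQL database.
document_store = FAISSDocumentStore.load(index_path="my_faiss_index.faiss")
```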
@@ -2928,7 +3309,13 @@ None #### Milvus1DocumentStore.query\_by\_embedding ```python -def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document] +def query_by_embedding(query_emb: np.ndarray, + filters: Optional[Dict[str, Any]] = None, + top_k: int = 10, + index: Optional[str] = None, + return_embedding: Optional[bool] = None, + headers: Optional[Dict[str, str]] = None, + scale_score: bool = True) -> List[Document] ``` Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. @@ -2954,7 +3341,9 @@ list of Documents that are the most similar to `query_emb` #### Milvus1DocumentStore.delete\_all\_documents ```python -def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +def delete_all_documents(index: Optional[str] = None, + filters: Optional[Dict[str, Any]] = None, + headers: Optional[Dict[str, str]] = None) ``` Delete all documents (from SQL AND Milvus). @@ -2974,7 +3363,10 @@ None #### Milvus1DocumentStore.delete\_documents ```python -def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +def delete_documents(index: Optional[str] = None, + ids: Optional[List[str]] = None, + filters: Optional[Dict[str, Any]] = None, + headers: Optional[Dict[str, str]] = None) ``` Delete documents in an index. All documents are deleted if no filters are passed. @@ -3017,7 +3409,13 @@ None #### Milvus1DocumentStore.get\_all\_documents\_generator ```python -def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +def get_all_documents_generator( + index: Optional[str] = None, + filters: Optional[Dict[str, Any]] = None, + return_embedding: Optional[bool] = None, + batch_size: int = 10_000, + headers: Optional[Dict[str, + str]] = None) -> Generator[Document, None, None] ``` Get all documents from the document store. Under-the-hood, documents are fetched in batches from the @@ -3039,7 +3437,12 @@ Example: {"name": ["some", "more"], "category": ["only_one"]} #### Milvus1DocumentStore.get\_all\_documents ```python -def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +def get_all_documents( + index: Optional[str] = None, + filters: Optional[Dict[str, Any]] = None, + return_embedding: Optional[bool] = None, + batch_size: int = 10_000, + headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Get documents from the document store (optionally using filter criteria). 
@@ -3058,7 +3461,10 @@ Example: {"name": ["some", "more"], "category": ["only_one"]} #### Milvus1DocumentStore.get\_document\_by\_id ```python -def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document] +def get_document_by_id( + id: str, + index: Optional[str] = None, + headers: Optional[Dict[str, str]] = None) -> Optional[Document] ``` Fetch a document by specifying its text id string @@ -3074,7 +3480,11 @@ DocumentStore's default index (self.index) will be used. #### Milvus1DocumentStore.get\_documents\_by\_id ```python -def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +def get_documents_by_id( + ids: List[str], + index: Optional[str] = None, + batch_size: int = 10_000, + headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Fetch multiple documents by specifying their IDs (strings) @@ -3110,7 +3520,8 @@ List[np.array]: List of vectors. #### Milvus1DocumentStore.get\_embedding\_count ```python -def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None) -> int +def get_embedding_count(index: Optional[str] = None, + filters: Optional[Dict[str, Any]] = None) -> int ``` Return the count of embeddings in the document store. @@ -3159,7 +3570,27 @@ does not allow these data types (yet). #### Milvus2DocumentStore.\_\_init\_\_ ```python -def __init__(sql_url: str = "sqlite:///", host: str = "localhost", port: str = "19530", connection_pool: str = "SingletonThread", index: str = "document", vector_dim: int = None, embedding_dim: int = 768, index_file_size: int = 1024, similarity: str = "dot_product", index_type: str = "IVF_FLAT", index_param: Optional[Dict[str, Any]] = None, search_param: Optional[Dict[str, Any]] = None, return_embedding: bool = False, embedding_field: str = "embedding", id_field: str = "id", custom_fields: Optional[List[Any]] = None, progress_bar: bool = True, duplicate_documents: str = "overwrite", isolation_level: str = None, consistency_level: int = 0, recreate_index: bool = False) +def __init__(sql_url: str = "sqlite:///", + host: str = "localhost", + port: str = "19530", + connection_pool: str = "SingletonThread", + index: str = "document", + vector_dim: int = None, + embedding_dim: int = 768, + index_file_size: int = 1024, + similarity: str = "dot_product", + index_type: str = "IVF_FLAT", + index_param: Optional[Dict[str, Any]] = None, + search_param: Optional[Dict[str, Any]] = None, + return_embedding: bool = False, + embedding_field: str = "embedding", + id_field: str = "id", + custom_fields: Optional[List[Any]] = None, + progress_bar: bool = True, + duplicate_documents: str = "overwrite", + isolation_level: str = None, + consistency_level: int = 0, + recreate_index: bool = False) ``` **Arguments**: @@ -3218,7 +3649,12 @@ be recreated. #### Milvus2DocumentStore.write\_documents ```python -def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None, index_param: Optional[Dict[str, Any]] = None) +def write_documents(documents: Union[List[dict], List[Document]], + index: Optional[str] = None, + batch_size: int = 10_000, + duplicate_documents: Optional[str] = None, + headers: Optional[Dict[str, str]] = None, + index_param: Optional[Dict[str, Any]] = None) ``` Add new documents to the DocumentStore. 
@@ -3245,7 +3681,11 @@ exists.

#### Milvus2DocumentStore.update\_embeddings

```python
-def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, batch_size: int = 10_000, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Any]] = None)
+def update_embeddings(retriever: "BaseRetriever",
+                      index: Optional[str] = None,
+                      batch_size: int = 10_000,
+                      update_existing_embeddings: bool = True,
+                      filters: Optional[Dict[str, Any]] = None)
```

Updates the embeddings in the document store using the encoding model specified in the retriever.

@@ -3273,7 +3713,13 @@ None

#### Milvus2DocumentStore.query\_by\_embedding

```python
-def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document]
+def query_by_embedding(query_emb: np.ndarray,
+                       filters: Optional[Dict[str, Any]] = None,
+                       top_k: int = 10,
+                       index: Optional[str] = None,
+                       return_embedding: Optional[bool] = None,
+                       headers: Optional[Dict[str, str]] = None,
+                       scale_score: bool = True) -> List[Document]
```

Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.

@@ -3295,7 +3741,11 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.

#### Milvus2DocumentStore.delete\_documents

```python
-def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None, batch_size: int = 10_000)
+def delete_documents(index: Optional[str] = None,
+                     ids: Optional[List[str]] = None,
+                     filters: Optional[Dict[str, Any]] = None,
+                     headers: Optional[Dict[str, str]] = None,
+                     batch_size: int = 10_000)
```

Delete all documents (from SQL AND Milvus).

@@ -3333,7 +3783,13 @@ None

#### Milvus2DocumentStore.get\_all\_documents\_generator

```python
-def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+def get_all_documents_generator(
+        index: Optional[str] = None,
+        filters: Optional[Dict[str, Any]] = None,
+        return_embedding: Optional[bool] = None,
+        batch_size: int = 10_000,
+        headers: Optional[Dict[str,
+                               str]] = None) -> Generator[Document, None, None]
```

Get all documents from the document store. Under-the-hood, documents are fetched in batches from the

@@ -3355,7 +3811,12 @@ Example: {"name": ["some", "more"], "category": ["only_one"]}

#### Milvus2DocumentStore.get\_all\_documents

```python
-def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def get_all_documents(
+        index: Optional[str] = None,
+        filters: Optional[Dict[str, Any]] = None,
+        return_embedding: Optional[bool] = None,
+        batch_size: int = 10_000,
+        headers: Optional[Dict[str, str]] = None) -> List[Document]
```

Get documents from the document store (optionally using filter criteria).
@@ -3374,7 +3835,10 @@ Example: {"name": ["some", "more"], "category": ["only_one"]} #### Milvus2DocumentStore.get\_document\_by\_id ```python -def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document] +def get_document_by_id( + id: str, + index: Optional[str] = None, + headers: Optional[Dict[str, str]] = None) -> Optional[Document] ``` Fetch a document by specifying its text id string @@ -3390,7 +3854,11 @@ DocumentStore's default index (self.index) will be used. #### Milvus2DocumentStore.get\_documents\_by\_id ```python -def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +def get_documents_by_id( + ids: List[str], + index: Optional[str] = None, + batch_size: int = 10_000, + headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Fetch multiple documents by specifying their IDs (strings) @@ -3407,7 +3875,8 @@ DocumentStore's default index (self.index) will be used. #### Milvus2DocumentStore.get\_embedding\_count ```python -def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int +def get_embedding_count(index: Optional[str] = None, + filters: Optional[Dict[str, List[str]]] = None) -> int ``` Return the count of embeddings in the document store. @@ -3448,7 +3917,23 @@ The current implementation is not supporting the storage of labels, so you canno #### WeaviateDocumentStore.\_\_init\_\_ ```python -def __init__(host: Union[str, List[str]] = "http://localhost", port: Union[int, List[int]] = 8080, timeout_config: tuple = (5, 15), username: str = None, password: str = None, index: str = "Document", embedding_dim: int = 768, content_field: str = "content", name_field: str = "name", similarity: str = "cosine", index_type: str = "hnsw", custom_schema: Optional[dict] = None, return_embedding: bool = False, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", recreate_index: bool = False) +def __init__(host: Union[str, List[str]] = "http://localhost", + port: Union[int, List[int]] = 8080, + timeout_config: tuple = (5, 15), + username: str = None, + password: str = None, + index: str = "Document", + embedding_dim: int = 768, + content_field: str = "content", + name_field: str = "name", + similarity: str = "cosine", + index_type: str = "hnsw", + custom_schema: Optional[dict] = None, + return_embedding: bool = False, + embedding_field: str = "embedding", + progress_bar: bool = True, + duplicate_documents: str = "overwrite", + recreate_index: bool = False) ``` **Arguments**: @@ -3491,7 +3976,10 @@ lost if you choose to recreate the index. 
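A construction sketch for the arguments documented above, assuming a local Weaviate instance on the default port (as used elsewhere in this repo's CI):

```python
from haystack.document_stores import WeaviateDocumentStore
from haystack.schema import Document

document_store = WeaviateDocumentStore(
    host="http://localhost",
    port=8080,
    index="Document",      # Weaviate class name
    embedding_dim=768,
    similarity="cosine",
    recreate_index=False,  # True would wipe existing documents in this index
)
document_store.write_documents([Document(content="Weaviate stores objects together with vectors.")])
print(document_store.get_document_count())
```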
#### WeaviateDocumentStore.get\_document\_by\_id ```python -def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document] +def get_document_by_id( + id: str, + index: Optional[str] = None, + headers: Optional[Dict[str, str]] = None) -> Optional[Document] ``` Fetch a document by specifying its uuid string @@ -3501,7 +3989,11 @@ Fetch a document by specifying its uuid string #### WeaviateDocumentStore.get\_documents\_by\_id ```python -def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +def get_documents_by_id( + ids: List[str], + index: Optional[str] = None, + batch_size: int = 10_000, + headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Fetch documents by specifying a list of uuid strings. @@ -3511,7 +4003,11 @@ Fetch documents by specifying a list of uuid strings. #### WeaviateDocumentStore.write\_documents ```python -def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +def write_documents(documents: Union[List[dict], List[Document]], + index: Optional[str] = None, + batch_size: int = 10_000, + duplicate_documents: Optional[str] = None, + headers: Optional[Dict[str, str]] = None) ``` Add new documents to the DocumentStore. @@ -3541,7 +4037,9 @@ None #### WeaviateDocumentStore.update\_document\_meta ```python -def update_document_meta(id: str, meta: Dict[str, Union[List, str, int, float, bool]], index: str = None) +def update_document_meta(id: str, + meta: Dict[str, Union[List, str, int, float, bool]], + index: str = None) ``` Update the metadata dictionary of a document by specifying its string id. @@ -3552,7 +4050,10 @@ Overwrites only the specified fields, the unspecified ones remain unchanged. #### WeaviateDocumentStore.get\_embedding\_count ```python -def get_embedding_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None) -> int +def get_embedding_count(filters: Optional[Dict[str, + Union[Dict, List, str, int, + float, bool]]] = None, + index: Optional[str] = None) -> int ``` Return the number of embeddings in the document store, which is the same as the number of documents since @@ -3563,7 +4064,11 @@ every document has a default embedding. #### WeaviateDocumentStore.get\_document\_count ```python -def get_document_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int +def get_document_count(filters: Optional[Dict[str, Union[Dict, List, str, int, + float, bool]]] = None, + index: Optional[str] = None, + only_documents_without_embedding: bool = False, + headers: Optional[Dict[str, str]] = None) -> int ``` Return the number of documents in the document store. @@ -3573,7 +4078,13 @@ Return the number of documents in the document store. 
#### WeaviateDocumentStore.get\_all\_documents

```python
-def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def get_all_documents(
+        index: Optional[str] = None,
+        filters: Optional[Dict[str, Union[Dict, List, str, int, float,
+                                          bool]]] = None,
+        return_embedding: Optional[bool] = None,
+        batch_size: int = 10_000,
+        headers: Optional[Dict[str, str]] = None) -> List[Document]
```

Get documents from the document store.

@@ -3632,7 +4143,14 @@ operation.

#### WeaviateDocumentStore.get\_all\_documents\_generator

```python
-def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+def get_all_documents_generator(
+        index: Optional[str] = None,
+        filters: Optional[Dict[str, Union[Dict, List, str, int, float,
+                                          bool]]] = None,
+        return_embedding: Optional[bool] = None,
+        batch_size: int = 10_000,
+        headers: Optional[Dict[str,
+                               str]] = None) -> Generator[Document, None, None]
```

Get documents from the document store. Under-the-hood, documents are fetched in batches from the

@@ -3694,7 +4212,15 @@ operation.

#### WeaviateDocumentStore.query

```python
-def query(query: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, all_terms_must_match: bool = False, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document]
+def query(query: Optional[str] = None,
+          filters: Optional[Dict[str, Union[Dict, List, str, int, float,
+                                            bool]]] = None,
+          top_k: int = 10,
+          all_terms_must_match: bool = False,
+          custom_query: Optional[str] = None,
+          index: Optional[str] = None,
+          headers: Optional[Dict[str, str]] = None,
+          scale_score: bool = True) -> List[Document]
```

Scan through documents in DocumentStore and return a small number of documents

@@ -3782,7 +4308,14 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.

#### WeaviateDocumentStore.query\_by\_embedding

```python
-def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document]
+def query_by_embedding(query_emb: np.ndarray,
+                       filters: Optional[Dict[str, Union[Dict, List, str, int,
+                                                         float, bool]]] = None,
+                       top_k: int = 10,
+                       index: Optional[str] = None,
+                       return_embedding: Optional[bool] = None,
+                       headers: Optional[Dict[str, str]] = None,
+                       scale_score: bool = True) -> List[Document]
```

Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.

@@ -3865,7 +4398,12 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
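For instance, the keyword `query` and vector `query_by_embedding` calls documented above might look like this (a sketch under the same local-Weaviate assumption as before; the random vector stands in for a real query embedding):

```python
import numpy as np

from haystack.document_stores import WeaviateDocumentStore

document_store = WeaviateDocumentStore(host="http://localhost", port=8080, index="Document")

# Keyword search: all_terms_must_match=False gives OR semantics across terms.
for d in document_store.query(query="vector search", top_k=5, all_terms_must_match=False):
    print(d.score, d.content)

# Vector search: a 768-dim placeholder where a retriever embedding would normally go.
docs = document_store.query_by_embedding(query_emb=np.random.rand(768).astype(np.float32), top_k=5)
```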
#### WeaviateDocumentStore.update\_embeddings

```python
-def update_embeddings(retriever, index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000)
+def update_embeddings(retriever,
+                      index: Optional[str] = None,
+                      filters: Optional[Dict[str, Union[Dict, List, str, int,
+                                                        float, bool]]] = None,
+                      update_existing_embeddings: bool = True,
+                      batch_size: int = 10_000)
```

Updates the embeddings in the document store using the encoding model specified in the retriever.

@@ -3915,7 +4453,11 @@ None

#### WeaviateDocumentStore.delete\_all\_documents

```python
-def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_all_documents(index: Optional[str] = None,
+                         filters: Optional[Dict[str,
+                                                Union[Dict, List, str, int,
+                                                      float, bool]]] = None,
+                         headers: Optional[Dict[str, str]] = None)
```

Delete documents in an index. All documents are deleted if no filters are passed.

@@ -3959,7 +4501,11 @@ None

#### WeaviateDocumentStore.delete\_documents

```python
-def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_documents(index: Optional[str] = None,
+                     ids: Optional[List[str]] = None,
+                     filters: Optional[Dict[str, Union[Dict, List, str, int,
+                                                       float, bool]]] = None,
+                     headers: Optional[Dict[str, str]] = None)
```

Delete documents in an index. All documents are deleted if no filters are passed.

@@ -4088,7 +4634,12 @@ Knowledge graph store that runs on a GraphDB instance.

#### GraphDBKnowledgeGraph.\_\_init\_\_

```python
-def __init__(host: str = "localhost", port: int = 7200, username: str = "", password: str = "", index: Optional[str] = None, prefixes: str = "")
+def __init__(host: str = "localhost",
+             port: int = 7200,
+             username: str = "",
+             password: str = "",
+             index: Optional[str] = None,
+             prefixes: str = "")
```

Init the knowledge graph by defining the settings to connect with a GraphDB instance

@@ -4137,7 +4688,9 @@ Delete the index that GraphDBKnowledgeGraph is connected to. This method deletes

#### GraphDBKnowledgeGraph.import\_from\_ttl\_file

```python
-def import_from_ttl_file(index: str, path: Path, headers: Optional[Dict[str, str]] = None)
+def import_from_ttl_file(index: str,
+                         path: Path,
+                         headers: Optional[Dict[str, str]] = None)
```

Load an existing knowledge graph represented in the form of triples of subject, predicate, and object from a .ttl file into an index of GraphDB

@@ -4153,7 +4706,8 @@

#### GraphDBKnowledgeGraph.get\_all\_triples

```python
-def get_all_triples(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
+def get_all_triples(index: Optional[str] = None,
+                    headers: Optional[Dict[str, str]] = None)
```

Query the given index in the GraphDB instance for all its stored triples. Duplicates are not filtered.
@@ -4172,7 +4726,8 @@ all triples stored in the index #### GraphDBKnowledgeGraph.get\_all\_subjects ```python -def get_all_subjects(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +def get_all_subjects(index: Optional[str] = None, + headers: Optional[Dict[str, str]] = None) ``` Query the given index in the GraphDB instance for all its stored subjects. Duplicates are not filtered. @@ -4191,7 +4746,8 @@ all subjects stored in the index #### GraphDBKnowledgeGraph.get\_all\_predicates ```python -def get_all_predicates(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +def get_all_predicates(index: Optional[str] = None, + headers: Optional[Dict[str, str]] = None) ``` Query the given index in the GraphDB instance for all its stored predicates. Duplicates are not filtered. @@ -4210,7 +4766,8 @@ all predicates stored in the index #### GraphDBKnowledgeGraph.get\_all\_objects ```python -def get_all_objects(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +def get_all_objects(index: Optional[str] = None, + headers: Optional[Dict[str, str]] = None) ``` Query the given index in the GraphDB instance for all its stored objects. Duplicates are not filtered. @@ -4229,7 +4786,9 @@ all objects stored in the index #### GraphDBKnowledgeGraph.query ```python -def query(sparql_query: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +def query(sparql_query: str, + index: Optional[str] = None, + headers: Optional[Dict[str, str]] = None) ``` Execute a SPARQL query on the given index in the GraphDB instance @@ -4271,7 +4830,15 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore) #### DeepsetCloudDocumentStore.\_\_init\_\_ ```python -def __init__(api_key: str = None, workspace: str = "default", index: Optional[str] = None, duplicate_documents: str = "overwrite", api_endpoint: Optional[str] = None, similarity: str = "dot_product", return_embedding: bool = False, label_index: str = "default", embedding_dim: int = 768) +def __init__(api_key: str = None, + workspace: str = "default", + index: Optional[str] = None, + duplicate_documents: str = "overwrite", + api_endpoint: Optional[str] = None, + similarity: str = "dot_product", + return_embedding: bool = False, + label_index: str = "default", + embedding_dim: int = 768) ``` A DocumentStore facade enabling you to interact with the documents stored in deepset Cloud. @@ -4321,7 +4888,13 @@ more performant with DPR embeddings. 'cosine' is recommended if you are using a #### DeepsetCloudDocumentStore.get\_all\_documents ```python -def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +def get_all_documents( + index: Optional[str] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, + bool]]] = None, + return_embedding: Optional[bool] = None, + batch_size: int = 10_000, + headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Get documents from the document store. @@ -4365,7 +4938,14 @@ operation. 
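A minimal sketch of the facade documented above; the API key, workspace, and index name are placeholders:

```python
from haystack.document_stores import DeepsetCloudDocumentStore

document_store = DeepsetCloudDocumentStore(
    api_key="DC_API_KEY",   # placeholder
    workspace="default",
    index="my-index",       # placeholder pipeline index
)
docs = document_store.get_all_documents(return_embedding=False)
print(len(docs))
```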
#### DeepsetCloudDocumentStore.get\_all\_documents\_generator

```python
-def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+def get_all_documents_generator(
+        index: Optional[str] = None,
+        filters: Optional[Dict[str, Union[Dict, List, str, int, float,
+                                          bool]]] = None,
+        return_embedding: Optional[bool] = None,
+        batch_size: int = 10_000,
+        headers: Optional[Dict[str,
+                               str]] = None) -> Generator[Document, None, None]
```

Get documents from the document store. Under-the-hood, documents are fetched in batches from the

@@ -4412,7 +4992,14 @@ operation.

#### DeepsetCloudDocumentStore.query\_by\_embedding

```python
-def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document]
+def query_by_embedding(query_emb: np.ndarray,
+                       filters: Optional[Dict[str, Union[Dict, List, str, int,
+                                                         float, bool]]] = None,
+                       top_k: int = 10,
+                       index: Optional[str] = None,
+                       return_embedding: Optional[bool] = None,
+                       headers: Optional[Dict[str, str]] = None,
+                       scale_score: bool = True) -> List[Document]
```

Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.

@@ -4496,7 +5083,15 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.

#### DeepsetCloudDocumentStore.query

```python
-def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False, scale_score: bool = True) -> List[Document]
+def query(query: Optional[str],
+          filters: Optional[Dict[str, Union[Dict, List, str, int, float,
+                                            bool]]] = None,
+          top_k: int = 10,
+          custom_query: Optional[str] = None,
+          index: Optional[str] = None,
+          headers: Optional[Dict[str, str]] = None,
+          all_terms_must_match: bool = False,
+          scale_score: bool = True) -> List[Document]
```

Scan through documents in DocumentStore and return a small number of documents

@@ -4587,7 +5182,11 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.

```python
@disable_and_log
-def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
+def write_documents(documents: Union[List[dict], List[Document]],
+                    index: Optional[str] = None,
+                    batch_size: int = 10_000,
+                    duplicate_documents: Optional[str] = None,
+                    headers: Optional[Dict[str, str]] = None)
```

Indexes documents for later queries.
@@ -4651,7 +5250,10 @@ These contain ("name", "evaluation_set_id", "created_at", "matched_labels", "tot

#### DeepsetCloudDocumentStore.get\_all\_labels

```python
-def get_all_labels(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) -> List[Label]
+def get_all_labels(index: Optional[str] = None,
+                   filters: Optional[Dict[str, Union[Dict, List, str, int,
+                                                     float, bool]]] = None,
+                   headers: Optional[Dict[str, str]] = None) -> List[Label]
```

Returns a list of labels for the given index name.

@@ -4671,7 +5273,8 @@ list of Labels.

#### DeepsetCloudDocumentStore.get\_label\_count

```python
-def get_label_count(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> int
+def get_label_count(index: Optional[str] = None,
+                    headers: Optional[Dict[str, str]] = None) -> int
```

Counts the number of labels for the given index and returns the value.

@@ -4714,7 +5317,21 @@ the vector embeddings and metadata (for filtering) are indexed in a Pinecone Ind

#### PineconeDocumentStore.\_\_init\_\_

```python
-def __init__(api_key: str, environment: str = "us-west1-gcp", pinecone_index: Optional[pinecone.Index] = None, embedding_dim: int = 768, return_embedding: bool = False, index: str = "document", similarity: str = "cosine", replicas: int = 1, shards: int = 1, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", recreate_index: bool = False, metadata_config: dict = {"indexed": []}, validate_index_sync: bool = True)
+def __init__(api_key: str,
+             environment: str = "us-west1-gcp",
+             pinecone_index: Optional[pinecone.Index] = None,
+             embedding_dim: int = 768,
+             return_embedding: bool = False,
+             index: str = "document",
+             similarity: str = "cosine",
+             replicas: int = 1,
+             shards: int = 1,
+             embedding_field: str = "embedding",
+             progress_bar: bool = True,
+             duplicate_documents: str = "overwrite",
+             recreate_index: bool = False,
+             metadata_config: dict = {"indexed": []},
+             validate_index_sync: bool = True)
```

**Arguments**:

@@ -4757,7 +5374,11 @@ no fields are indexed.

#### PineconeDocumentStore.get\_document\_count

```python
-def get_document_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int
+def get_document_count(filters: Optional[Dict[str, Union[Dict, List, str, int,
+                                                         float, bool]]] = None,
+                       index: Optional[str] = None,
+                       only_documents_without_embedding: bool = False,
+                       headers: Optional[Dict[str, str]] = None) -> int
```

Return the count of documents in the document store.

@@ -4797,7 +5418,12 @@ operation.

#### PineconeDocumentStore.write\_documents

```python
-def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 32, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None, labels: Optional[bool] = False)
+def write_documents(documents: Union[List[dict], List[Document]],
+                    index: Optional[str] = None,
+                    batch_size: int = 32,
+                    duplicate_documents: Optional[str] = None,
+                    headers: Optional[Dict[str, str]] = None,
+                    labels: Optional[bool] = False)
```

Add new documents to the DocumentStore.
@@ -4826,7 +5452,12 @@ Parameter options: #### PineconeDocumentStore.update\_embeddings ```python -def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, batch_size: int = 32) +def update_embeddings(retriever: "BaseRetriever", + index: Optional[str] = None, + update_existing_embeddings: bool = True, + filters: Optional[Dict[str, Union[Dict, List, str, int, + float, bool]]] = None, + batch_size: int = 32) ``` Updates the embeddings in the document store using the encoding model specified in the retriever. @@ -4874,7 +5505,13 @@ batching can help reduce memory footprint. #### PineconeDocumentStore.get\_all\_documents ```python -def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 32, headers: Optional[Dict[str, str]] = None, namespace: Optional[str] = None) -> List[Document] +def get_all_documents(index: Optional[str] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, + float, bool]]] = None, + return_embedding: Optional[bool] = None, + batch_size: int = 32, + headers: Optional[Dict[str, str]] = None, + namespace: Optional[str] = None) -> List[Document] ``` Retrieves all documents in the index. @@ -4917,7 +5554,14 @@ batching can help reduce memory footprint. #### PineconeDocumentStore.get\_all\_documents\_generator ```python -def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 32, headers: Optional[Dict[str, str]] = None, namespace: Optional[str] = None) -> Generator[Document, None, None] +def get_all_documents_generator( + index: Optional[str] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, + bool]]] = None, + return_embedding: Optional[bool] = None, + batch_size: int = 32, + headers: Optional[Dict[str, str]] = None, + namespace: Optional[str] = None) -> Generator[Document, None, None] ``` Get all documents from the document store. Under-the-hood, documents are fetched in batches from the @@ -4963,7 +5607,12 @@ operation. #### PineconeDocumentStore.get\_documents\_by\_id ```python -def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 32, headers: Optional[Dict[str, str]] = None, return_embedding: Optional[bool] = None, namespace: str = None) -> List[Document] +def get_documents_by_id(ids: List[str], + index: Optional[str] = None, + batch_size: int = 32, + headers: Optional[Dict[str, str]] = None, + return_embedding: Optional[bool] = None, + namespace: str = None) -> List[Document] ``` Retrieves all documents in the index using their IDs. @@ -4983,7 +5632,11 @@ batching can help reduce memory footprint. #### PineconeDocumentStore.get\_document\_by\_id ```python -def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, return_embedding: Optional[bool] = None, namespace: str = None) -> Document +def get_document_by_id(id: str, + index: Optional[str] = None, + headers: Optional[Dict[str, str]] = None, + return_embedding: Optional[bool] = None, + namespace: str = None) -> Document ``` Returns a single Document retrieved using an ID. @@ -5001,7 +5654,11 @@ Returns a single Document retrieved using an ID. 
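The ID-based fetches documented above might be used as follows (a sketch assuming valid Pinecone credentials; the API key and document IDs are placeholders):

```python
from haystack.document_stores import PineconeDocumentStore

document_store = PineconeDocumentStore(
    api_key="PINECONE_API_KEY",  # placeholder
    environment="us-west1-gcp",
    embedding_dim=768,
)
batch = document_store.get_documents_by_id(ids=["doc-1", "doc-2"], batch_size=32)
single = document_store.get_document_by_id(id="doc-1", return_embedding=False)
```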
#### PineconeDocumentStore.get\_embedding\_count ```python -def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None) -> int +def get_embedding_count( + index: Optional[str] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, + bool]]] = None +) -> int ``` Return the count of embeddings in the document store. @@ -5016,7 +5673,10 @@ Return the count of embeddings in the document store. #### PineconeDocumentStore.update\_document\_meta ```python -def update_document_meta(id: str, meta: Dict[str, str], namespace: str = None, index: str = None) +def update_document_meta(id: str, + meta: Dict[str, str], + namespace: str = None, + index: str = None) ``` Update the metadata dictionary of a document by specifying its string ID. @@ -5034,7 +5694,13 @@ namespace (vectors) if it exists, otherwise the document namespace (no-vectors). #### PineconeDocumentStore.delete\_documents ```python -def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, drop_ids: Optional[bool] = True, namespace: Optional[str] = None) +def delete_documents(index: Optional[str] = None, + ids: Optional[List[str]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, + float, bool]]] = None, + headers: Optional[Dict[str, str]] = None, + drop_ids: Optional[bool] = True, + namespace: Optional[str] = None) ``` Delete documents from the document store. @@ -5103,7 +5769,15 @@ None #### PineconeDocumentStore.query\_by\_embedding ```python -def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True, namespace: Optional[str] = None) -> List[Document] +def query_by_embedding(query_emb: np.ndarray, + filters: Optional[Dict[str, Union[Dict, List, str, int, + float, bool]]] = None, + top_k: int = 10, + index: Optional[str] = None, + return_embedding: Optional[bool] = None, + headers: Optional[Dict[str, str]] = None, + scale_score: bool = True, + namespace: Optional[str] = None) -> List[Document] ``` Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. @@ -5195,7 +5869,11 @@ Default class method used for loading indexes. Not applicable to PineconeDocumen #### PineconeDocumentStore.delete\_labels ```python -def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None, batch_size: int = 32) +def delete_labels(index: Optional[str] = None, + ids: Optional[List[str]] = None, + filters: Optional[Dict[str, Any]] = None, + headers: Optional[Dict[str, str]] = None, + batch_size: int = 32) ``` Default class method used for deleting labels. Not supported by PineconeDocumentStore. @@ -5205,7 +5883,9 @@ Default class method used for deleting labels. Not supported by PineconeDocument #### PineconeDocumentStore.get\_all\_labels ```python -def get_all_labels(index=None, filters: Optional[dict] = None, headers: Optional[Dict[str, str]] = None) +def get_all_labels(index=None, + filters: Optional[dict] = None, + headers: Optional[Dict[str, str]] = None) ``` Default class method used for getting all labels. 
@@ -5215,7 +5895,8 @@ Default class method used for getting all labels. #### PineconeDocumentStore.get\_label\_count ```python -def get_label_count(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +def get_label_count(index: Optional[str] = None, + headers: Optional[Dict[str, str]] = None) ``` Default class method used for counting labels. Not supported by PineconeDocumentStore. @@ -5239,7 +5920,11 @@ Default class method used for writing labels. #### eval\_data\_from\_json ```python -def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None, preprocessor: PreProcessor = None, open_domain: bool = False) -> Tuple[List[Document], List[Label]] +def eval_data_from_json( + filename: str, + max_docs: Union[int, bool] = None, + preprocessor: PreProcessor = None, + open_domain: bool = False) -> Tuple[List[Document], List[Label]] ``` Read Documents + Labels from a SQuAD-style file. @@ -5257,7 +5942,13 @@ Document and Labels can then be indexed to the DocumentStore and be used for eva #### eval\_data\_from\_jsonl ```python -def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None, max_docs: Union[int, bool] = None, preprocessor: PreProcessor = None, open_domain: bool = False) -> Generator[Tuple[List[Document], List[Label]], None, None] +def eval_data_from_jsonl( + filename: str, + batch_size: Optional[int] = None, + max_docs: Union[int, bool] = None, + preprocessor: PreProcessor = None, + open_domain: bool = False +) -> Generator[Tuple[List[Document], List[Label]], None, None] ``` Read Documents + Labels from a SQuAD-style file in jsonl format, i.e. one document per line. diff --git a/docs/_src/api/api/evaluation.md b/docs/_src/api/api/evaluation.md index 7db66a452..a2ed13102 100644 --- a/docs/_src/api/api/evaluation.md +++ b/docs/_src/api/api/evaluation.md @@ -40,7 +40,9 @@ When False, correct retrieval is evaluated based on document_id. #### EvalDocuments.run ```python -def run(documents: List[Document], labels: List[Label], top_k: Optional[int] = None) +def run(documents: List[Document], + labels: List[Label], + top_k: Optional[int] = None) ``` Run this node on one sample and its labels @@ -78,7 +80,10 @@ Please use pipeline.eval() instead. #### EvalAnswers.\_\_init\_\_ ```python -def __init__(skip_incorrect_retrieval: bool = True, open_domain: bool = True, sas_model: str = None, debug: bool = False) +def __init__(skip_incorrect_retrieval: bool = True, + open_domain: bool = True, + sas_model: str = None, + debug: bool = False) ``` **Arguments**: @@ -123,7 +128,15 @@ Print the evaluation results #### semantic\_answer\_similarity ```python -def semantic_answer_similarity(predictions: List[List[str]], gold_labels: List[List[str]], sas_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", batch_size: int = 32, use_gpu: bool = True, use_auth_token: Optional[Union[str, bool]] = None) -> Tuple[List[float], List[float], List[List[float]]] +def semantic_answer_similarity( + predictions: List[List[str]], + gold_labels: List[List[str]], + sas_model_name_or_path: + str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", + batch_size: int = 32, + use_gpu: bool = True, + use_auth_token: Optional[Union[str, bool]] = None +) -> Tuple[List[float], List[float], List[List[float]]] ``` Computes Transformer-based similarity of predicted answer to gold labels to derive a more meaningful metric than EM or F1. 
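A usage sketch for `semantic_answer_similarity`; the import path is assumed from this page's module, and the three return values follow the signature above:

```python
from haystack.nodes.evaluator.evaluator import semantic_answer_similarity

predictions = [["Berlin"]]                        # one list of predicted answers per query
gold_labels = [["Berlin", "capital of Germany"]]  # one list of gold answers per query

# Scores are cross-encoder/bi-encoder similarities in [0, 1]; the matrix holds
# the pairwise prediction-vs-label scores per query.
top_1_sas, top_k_sas, matrix = semantic_answer_similarity(
    predictions=predictions,
    gold_labels=gold_labels,
    sas_model_name_or_path="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    batch_size=32,
    use_gpu=False,
)
print(top_1_sas, top_k_sas)
```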
diff --git a/docs/_src/api/api/extractor.md b/docs/_src/api/api/extractor.md
index 339fa8069..8eb8101de 100644
--- a/docs/_src/api/api/extractor.md
+++ b/docs/_src/api/api/extractor.md
@@ -39,7 +39,9 @@ parameter is not used and a single cpu device is used for inference.

#### EntityExtractor.run

```python
-def run(documents: Optional[Union[List[Document], List[dict]]] = None) -> Tuple[Dict, str]
+def run(
+    documents: Optional[Union[List[Document], List[dict]]] = None
+) -> Tuple[Dict, str]
```

This is the method called when this node is used in a pipeline

@@ -59,7 +61,8 @@ This function can be called to perform entity extraction when using the node in

#### EntityExtractor.extract\_batch

```python
-def extract_batch(texts: Union[List[str], List[List[str]]], batch_size: Optional[int] = None)
+def extract_batch(texts: Union[List[str], List[List[str]]],
+                  batch_size: Optional[int] = None)
```

This function allows you to extract entities out of a list of strings or a list of lists of strings.

diff --git a/docs/_src/api/api/file_classifier.md b/docs/_src/api/api/file_classifier.md
index 8ffe4963b..767001b56 100644
--- a/docs/_src/api/api/file_classifier.md
+++ b/docs/_src/api/api/file_classifier.md
@@ -33,7 +33,8 @@ Lists with duplicate elements are not allowed.

#### FileTypeClassifier.run

```python
-def run(file_paths: Union[Path, List[Path], str, List[str], List[Union[Path, str]]])
+def run(file_paths: Union[Path, List[Path], str, List[str], List[Union[Path,
+                                                                        str]]])
```

Sends out files on a different output edge depending on their extension.

diff --git a/docs/_src/api/api/file_converter.md b/docs/_src/api/api/file_converter.md
index c1c471017..0d2eb4374 100644
--- a/docs/_src/api/api/file_converter.md
+++ b/docs/_src/api/api/file_converter.md
@@ -17,7 +17,10 @@ Base class for implementing file converters to transform input documents to text f

#### BaseConverter.\_\_init\_\_

```python
-def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, progress_bar: bool = True)
+def __init__(remove_numeric_tables: bool = False,
+             valid_languages: Optional[List[str]] = None,
+             id_hash_keys: Optional[List[str]] = None,
+             progress_bar: bool = True)
```

**Arguments**:

@@ -44,7 +47,12 @@ In this case the id will be generated by using the content and the defined metad

```python
@abstractmethod
-def convert(file_path: Path, meta: Optional[Dict[str, Any]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path,
+            meta: Optional[Dict[str, Any]],
+            remove_numeric_tables: Optional[bool] = None,
+            valid_languages: Optional[List[str]] = None,
+            encoding: Optional[str] = "UTF-8",
+            id_hash_keys: Optional[List[str]] = None) -> List[Document]
```

Convert a file to a dictionary containing the text and any associated meta data.

@@ -77,7 +85,8 @@ In this case the id will be generated by using the content and the defined metad

#### BaseConverter.validate\_language

```python
-def validate_language(text: str, valid_languages: Optional[List[str]] = None) -> bool
+def validate_language(text: str,
+                      valid_languages: Optional[List[str]] = None) -> bool
```

Validate if the language of the text is one of valid languages.

@@ -87,7 +96,14 @@
#### BaseConverter.run

```python
-def run(file_paths: Union[Path, List[Path]], meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, remove_numeric_tables: Optional[bool] = None, known_ligatures: Dict[str, str] = KNOWN_LIGATURES, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None)
+def run(file_paths: Union[Path, List[Path]],
+        meta: Optional[Union[Dict[str, str],
+                             List[Optional[Dict[str, str]]]]] = None,
+        remove_numeric_tables: Optional[bool] = None,
+        known_ligatures: Dict[str, str] = KNOWN_LIGATURES,
+        valid_languages: Optional[List[str]] = None,
+        encoding: Optional[str] = "UTF-8",
+        id_hash_keys: Optional[List[str]] = None)
```

Extract text from a file.

@@ -137,7 +153,12 @@ class DocxToTextConverter(BaseConverter)

#### DocxToTextConverter.convert

```python
-def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path,
+            meta: Optional[Dict[str, str]] = None,
+            remove_numeric_tables: Optional[bool] = None,
+            valid_languages: Optional[List[str]] = None,
+            encoding: Optional[str] = None,
+            id_hash_keys: Optional[List[str]] = None) -> List[Document]
```

Extract text from a .docx file.

@@ -182,7 +203,9 @@ class ImageToTextConverter(BaseConverter)

#### ImageToTextConverter.\_\_init\_\_

```python
-def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"], id_hash_keys: Optional[List[str]] = None)
+def __init__(remove_numeric_tables: bool = False,
+             valid_languages: Optional[List[str]] = ["eng"],
+             id_hash_keys: Optional[List[str]] = None)
```

**Arguments**:

@@ -209,7 +232,12 @@ In this case the id will be generated by using the content and the defined metad

#### ImageToTextConverter.convert

```python
-def convert(file_path: Union[Path, str], meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Union[Path, str],
+            meta: Optional[Dict[str, str]] = None,
+            remove_numeric_tables: Optional[bool] = None,
+            valid_languages: Optional[List[str]] = None,
+            encoding: Optional[str] = None,
+            id_hash_keys: Optional[List[str]] = None) -> List[Document]
```

Extract text from an image file using the pytesseract library (https://github.com/madmaze/pytesseract)

@@ -252,7 +280,12 @@ class MarkdownConverter(BaseConverter)

#### MarkdownConverter.convert

```python
-def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path,
+            meta: Optional[Dict[str, str]] = None,
+            remove_numeric_tables: Optional[bool] = None,
+            valid_languages: Optional[List[str]] = None,
+            encoding: Optional[str] = "utf-8",
+            id_hash_keys: Optional[List[str]] = None) -> List[Document]
```

Reads text from a markdown file and executes optional preprocessing steps.
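A converter sketch matching the section above; the file path and meta dict are placeholders:

```python
from pathlib import Path

from haystack.nodes import MarkdownConverter

converter = MarkdownConverter(remove_numeric_tables=False)
docs = converter.convert(file_path=Path("README.md"), meta={"source": "readme"})
print(docs[0].content[:200])  # plain text extracted from the markdown
```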
@@ -301,7 +334,10 @@ class PDFToTextConverter(BaseConverter)

#### PDFToTextConverter.\_\_init\_\_

```python
-def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8")
+def __init__(remove_numeric_tables: bool = False,
+             valid_languages: Optional[List[str]] = None,
+             id_hash_keys: Optional[List[str]] = None,
+             encoding: Optional[str] = "UTF-8")
```

**Arguments**:

@@ -329,7 +365,12 @@ Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts,

#### PDFToTextConverter.convert

```python
-def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path,
+            meta: Optional[Dict[str, str]] = None,
+            remove_numeric_tables: Optional[bool] = None,
+            valid_languages: Optional[List[str]] = None,
+            encoding: Optional[str] = None,
+            id_hash_keys: Optional[List[str]] = None) -> List[Document]
```

Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)

@@ -369,7 +410,9 @@ class PDFToTextOCRConverter(BaseConverter)

#### PDFToTextOCRConverter.\_\_init\_\_

```python
-def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"], id_hash_keys: Optional[List[str]] = None)
+def __init__(remove_numeric_tables: bool = False,
+             valid_languages: Optional[List[str]] = ["eng"],
+             id_hash_keys: Optional[List[str]] = None)
```

Extract text from an image file using the pytesseract library (https://github.com/madmaze/pytesseract)

@@ -396,7 +439,12 @@ In this case the id will be generated by using the content and the defined metad

#### PDFToTextOCRConverter.convert

```python
-def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path,
+            meta: Optional[Dict[str, str]] = None,
+            remove_numeric_tables: Optional[bool] = None,
+            valid_languages: Optional[List[str]] = None,
+            encoding: Optional[str] = None,
+            id_hash_keys: Optional[List[str]] = None) -> List[Document]
```

Convert a file to a dictionary containing the text and any associated meta data.
@@ -446,7 +494,17 @@ Supported file formats are: PDF, DOCX #### ParsrConverter.\_\_init\_\_ ```python -def __init__(parsr_url: str = "http://localhost:3001", extractor: Literal["pdfminer", "pdfjs"] = "pdfminer", table_detection_mode: Literal["lattice", "stream"] = "lattice", preceding_context_len: int = 3, following_context_len: int = 3, remove_page_headers: bool = False, remove_page_footers: bool = False, remove_table_of_contents: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, add_page_number: bool = True) +def __init__(parsr_url: str = "http://localhost:3001", + extractor: Literal["pdfminer", "pdfjs"] = "pdfminer", + table_detection_mode: Literal["lattice", "stream"] = "lattice", + preceding_context_len: int = 3, + following_context_len: int = 3, + remove_page_headers: bool = False, + remove_page_footers: bool = False, + remove_table_of_contents: bool = False, + valid_languages: Optional[List[str]] = None, + id_hash_keys: Optional[List[str]] = None, + add_page_number: bool = True) ``` **Arguments**: @@ -480,7 +538,12 @@ In this case the id will be generated by using the content and the defined metad #### ParsrConverter.convert ```python -def convert(file_path: Path, meta: Optional[Dict[str, Any]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document] +def convert(file_path: Path, + meta: Optional[Dict[str, Any]] = None, + remove_numeric_tables: Optional[bool] = None, + valid_languages: Optional[List[str]] = None, + encoding: Optional[str] = "utf-8", + id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` Extract text and tables from a PDF or DOCX using the open-source Parsr tool. 
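For example, running the converter against the Parsr container used elsewhere in this repo's CI (a sketch; `sample.pdf` is a placeholder file):

```python
from pathlib import Path

from haystack.nodes import ParsrConverter

converter = ParsrConverter(parsr_url="http://localhost:3001")
for doc in converter.convert(file_path=Path("sample.pdf")):
    print(doc.content_type)  # "text" for prose, "table" for extracted tables
```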
@@ -529,7 +592,16 @@ https://docs.microsoft.com/en-us/azure/applied-ai-services/form-recognizer/quick #### AzureConverter.\_\_init\_\_ ```python -def __init__(endpoint: str, credential_key: str, model_id: str = "prebuilt-document", valid_languages: Optional[List[str]] = None, save_json: bool = False, preceding_context_len: int = 3, following_context_len: int = 3, merge_multiple_column_headers: bool = True, id_hash_keys: Optional[List[str]] = None, add_page_number: bool = True) +def __init__(endpoint: str, + credential_key: str, + model_id: str = "prebuilt-document", + valid_languages: Optional[List[str]] = None, + save_json: bool = False, + preceding_context_len: int = 3, + following_context_len: int = 3, + merge_multiple_column_headers: bool = True, + id_hash_keys: Optional[List[str]] = None, + add_page_number: bool = True) ``` **Arguments**: @@ -564,7 +636,14 @@ In this case the id will be generated by using the content and the defined metad #### AzureConverter.convert ```python -def convert(file_path: Path, meta: Optional[Dict[str, Any]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None, pages: Optional[str] = None, known_language: Optional[str] = None) -> List[Document] +def convert(file_path: Path, + meta: Optional[Dict[str, Any]] = None, + remove_numeric_tables: Optional[bool] = None, + valid_languages: Optional[List[str]] = None, + encoding: Optional[str] = "utf-8", + id_hash_keys: Optional[List[str]] = None, + pages: Optional[str] = None, + known_language: Optional[str] = None) -> List[Document] ``` Extract text and tables from a PDF, JPEG, PNG, BMP or TIFF file using Azure's Form Recognizer service. @@ -596,7 +675,11 @@ See supported locales here: https://aka.ms/azsdk/formrecognizer/supportedlocales #### AzureConverter.convert\_azure\_json ```python -def convert_azure_json(file_path: Path, meta: Optional[Dict[str, Any]] = None, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document] +def convert_azure_json( + file_path: Path, + meta: Optional[Dict[str, Any]] = None, + valid_languages: Optional[List[str]] = None, + id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` Extract text and tables from the JSON output of Azure's Form Recognizer service. 
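A sketch of the Form Recognizer flow above; the endpoint, credential key, and file are placeholders:

```python
from pathlib import Path

from haystack.nodes import AzureConverter

converter = AzureConverter(
    endpoint="https://<resource-name>.cognitiveservices.azure.com/",  # placeholder
    credential_key="AZURE_CREDENTIAL_KEY",                            # placeholder
    model_id="prebuilt-document",
)
docs = converter.convert(file_path=Path("invoice.pdf"))  # placeholder file
```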
@@ -633,7 +716,10 @@ class TikaConverter(BaseConverter) #### TikaConverter.\_\_init\_\_ ```python -def __init__(tika_url: str = "http://localhost:9998/tika", remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None) +def __init__(tika_url: str = "http://localhost:9998/tika", + remove_numeric_tables: bool = False, + valid_languages: Optional[List[str]] = None, + id_hash_keys: Optional[List[str]] = None) ``` **Arguments**: @@ -659,7 +745,12 @@ In this case the id will be generated by using the content and the defined metad #### TikaConverter.convert ```python -def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document] +def convert(file_path: Path, + meta: Optional[Dict[str, str]] = None, + remove_numeric_tables: Optional[bool] = None, + valid_languages: Optional[List[str]] = None, + encoding: Optional[str] = None, + id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` **Arguments**: @@ -703,7 +794,12 @@ class TextConverter(BaseConverter) #### TextConverter.convert ```python -def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document] +def convert(file_path: Path, + meta: Optional[Dict[str, str]] = None, + remove_numeric_tables: Optional[bool] = None, + valid_languages: Optional[List[str]] = None, + encoding: Optional[str] = "utf-8", + id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` Reads text from a txt file and executes optional preprocessing steps. diff --git a/docs/_src/api/api/generator.md b/docs/_src/api/api/generator.md index 58fbbd2b9..6d2353d18 100644 --- a/docs/_src/api/api/generator.md +++ b/docs/_src/api/api/generator.md @@ -18,7 +18,8 @@ Abstract class for Generators ```python @abstractmethod -def predict(query: str, documents: List[Document], top_k: Optional[int]) -> Dict +def predict(query: str, documents: List[Document], + top_k: Optional[int]) -> Dict ``` Abstract method to generate answers. @@ -38,7 +39,10 @@ Generated answers plus additional infos in a dict #### BaseGenerator.predict\_batch ```python -def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None) +def predict_batch(queries: List[str], + documents: Union[List[Document], List[List[Document]]], + top_k: Optional[int] = None, + batch_size: Optional[int] = None) ``` Generate the answer to the input queries. The generation will be conditioned on the supplied documents. @@ -138,7 +142,20 @@ i.e. 
the model can easily adjust to domain documents even after training has fin #### RAGenerator.\_\_init\_\_ ```python -def __init__(model_name_or_path: str = "facebook/rag-token-nq", model_version: Optional[str] = None, retriever: Optional[DensePassageRetriever] = None, generator_type: str = "token", top_k: int = 2, max_length: int = 200, min_length: int = 2, num_beams: int = 2, embed_title: bool = True, prefix: Optional[str] = None, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None) +def __init__(model_name_or_path: str = "facebook/rag-token-nq", + model_version: Optional[str] = None, + retriever: Optional[DensePassageRetriever] = None, + generator_type: str = "token", + top_k: int = 2, + max_length: int = 200, + min_length: int = 2, + num_beams: int = 2, + embed_title: bool = True, + prefix: Optional[str] = None, + use_gpu: bool = True, + progress_bar: bool = True, + use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None) ``` Load a RAG model from Transformers along with passage_embedding_model. @@ -176,7 +193,9 @@ parameter is not used and a single cpu device is used for inference. #### RAGenerator.predict ```python -def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict +def predict(query: str, + documents: List[Document], + top_k: Optional[int] = None) -> Dict ``` Generate the answer to the input query. The generation will be conditioned on the supplied documents. @@ -266,7 +285,16 @@ the [Hugging Face Model Hub](https://huggingface.co/models?pipeline_tag=text2tex #### Seq2SeqGenerator.\_\_init\_\_ ```python -def __init__(model_name_or_path: str, input_converter: Optional[Callable] = None, top_k: int = 1, max_length: int = 200, min_length: int = 2, num_beams: int = 8, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None) +def __init__(model_name_or_path: str, + input_converter: Optional[Callable] = None, + top_k: int = 1, + max_length: int = 200, + min_length: int = 2, + num_beams: int = 8, + use_gpu: bool = True, + progress_bar: bool = True, + use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None) ``` **Arguments**: @@ -298,7 +326,9 @@ parameter is not used and a single cpu device is used for inference. #### Seq2SeqGenerator.predict ```python -def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict +def predict(query: str, + documents: List[Document], + top_k: Optional[int] = None) -> Dict ``` Generate the answer to the input query. The generation will be conditioned on the supplied documents. @@ -338,7 +368,17 @@ on the [OpenAI API website](https://openai.com/api/). 
#### OpenAIAnswerGenerator.\_\_init\_\_ ```python -def __init__(api_key: str, model: str = "text-curie-001", max_tokens: int = 7, top_k: int = 5, temperature: int = 0, presence_penalty: float = -2.0, frequency_penalty: float = -2.0, examples_context: Optional[str] = None, examples: Optional[List] = None, stop_words: Optional[List] = None, progress_bar: bool = True) +def __init__(api_key: str, + model: str = "text-curie-001", + max_tokens: int = 7, + top_k: int = 5, + temperature: int = 0, + presence_penalty: float = -2.0, + frequency_penalty: float = -2.0, + examples_context: Optional[str] = None, + examples: Optional[List] = None, + stop_words: Optional[List] = None, + progress_bar: bool = True) ``` **Arguments**: @@ -374,7 +414,9 @@ If you don't provide it, the default from OpenAPI docs is used: ["\n", "<|endoft #### OpenAIAnswerGenerator.predict ```python -def predict(query: str, documents: List[Document], top_k: Optional[int] = None) +def predict(query: str, + documents: List[Document], + top_k: Optional[int] = None) ``` Use the loaded QA model to generate Answers for a query based on the Documents it receives. diff --git a/docs/_src/api/api/other_nodes.md b/docs/_src/api/api/other_nodes.md index e638a5701..3d6ef93dd 100644 --- a/docs/_src/api/api/other_nodes.md +++ b/docs/_src/api/api/other_nodes.md @@ -45,7 +45,10 @@ The node allows multiple join modes: #### JoinDocuments.\_\_init\_\_ ```python -def __init__(join_mode: str = "concatenate", weights: Optional[List[float]] = None, top_k_join: Optional[int] = None, sort_by_score: bool = True) +def __init__(join_mode: str = "concatenate", + weights: Optional[List[float]] = None, + top_k_join: Optional[int] = None, + sort_by_score: bool = True) ``` **Arguments**: @@ -79,7 +82,10 @@ A node to join `Answer`s produced by multiple `Reader` nodes. #### JoinAnswers.\_\_init\_\_ ```python -def __init__(join_mode: str = "concatenate", weights: Optional[List[float]] = None, top_k_join: Optional[int] = None, sort_by_score: bool = True) +def __init__(join_mode: str = "concatenate", + weights: Optional[List[float]] = None, + top_k_join: Optional[int] = None, + sort_by_score: bool = True) ``` **Arguments**: @@ -114,7 +120,8 @@ different nodes. #### RouteDocuments.\_\_init\_\_ ```python -def __init__(split_by: str = "content_type", metadata_values: Optional[List[str]] = None) +def __init__(split_by: str = "content_type", + metadata_values: Optional[List[str]] = None) ``` **Arguments**: diff --git a/docs/_src/api/api/pipelines.md b/docs/_src/api/api/pipelines.md index ad96fc821..7fe239cf9 100644 --- a/docs/_src/api/api/pipelines.md +++ b/docs/_src/api/api/pipelines.md @@ -42,7 +42,9 @@ Note that this also includes such components that are being utilized by other co #### Pipeline.to\_code ```python -def to_code(pipeline_variable_name: str = "pipeline", generate_imports: bool = True, add_comment: bool = False) -> str +def to_code(pipeline_variable_name: str = "pipeline", + generate_imports: bool = True, + add_comment: bool = False) -> str ``` Returns the code to create this pipeline as string. @@ -61,7 +63,9 @@ Default value is False. #### Pipeline.to\_notebook\_cell ```python -def to_notebook_cell(pipeline_variable_name: str = "pipeline", generate_imports: bool = True, add_comment: bool = True) +def to_notebook_cell(pipeline_variable_name: str = "pipeline", + generate_imports: bool = True, + add_comment: bool = True) ``` Creates a new notebook cell with the code to create this pipeline. @@ -81,7 +85,13 @@ Default value is True. 
```python @classmethod -def load_from_deepset_cloud(cls, pipeline_config_name: str, pipeline_name: str = "query", workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, overwrite_with_env_variables: bool = False) +def load_from_deepset_cloud(cls, + pipeline_config_name: str, + pipeline_name: str = "query", + workspace: str = "default", + api_key: Optional[str] = None, + api_endpoint: Optional[str] = None, + overwrite_with_env_variables: bool = False) ``` Load Pipeline from Deepset Cloud defining the individual components and how they're tied together to form @@ -114,7 +124,11 @@ variable 'READER_PARAMS_RETURN_NO_ANSWER=False' can be set. Note that an ```python @classmethod -def list_pipelines_on_deepset_cloud(cls, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None) -> List[dict] +def list_pipelines_on_deepset_cloud( + cls, + workspace: str = "default", + api_key: Optional[str] = None, + api_endpoint: Optional[str] = None) -> List[dict] ``` Lists all pipeline configs available on Deepset Cloud. @@ -150,7 +164,14 @@ Returns: ```python @classmethod -def save_to_deepset_cloud(cls, query_pipeline: Pipeline, index_pipeline: Pipeline, pipeline_config_name: str, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, overwrite: bool = False) +def save_to_deepset_cloud(cls, + query_pipeline: Pipeline, + index_pipeline: Pipeline, + pipeline_config_name: str, + workspace: str = "default", + api_key: Optional[str] = None, + api_endpoint: Optional[str] = None, + overwrite: bool = False) ``` Saves a Pipeline config to Deepset Cloud defining the individual components and how they're tied together to form @@ -175,7 +196,13 @@ If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment varia ```python @classmethod -def deploy_on_deepset_cloud(cls, pipeline_config_name: str, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, timeout: int = 60, show_curl_message: bool = True) +def deploy_on_deepset_cloud(cls, + pipeline_config_name: str, + workspace: str = "default", + api_key: Optional[str] = None, + api_endpoint: Optional[str] = None, + timeout: int = 60, + show_curl_message: bool = True) ``` Deploys the pipelines of a pipeline config on Deepset Cloud. @@ -205,7 +232,12 @@ If the timeout is exceeded an error will be raised. ```python @classmethod -def undeploy_on_deepset_cloud(cls, pipeline_config_name: str, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, timeout: int = 60) +def undeploy_on_deepset_cloud(cls, + pipeline_config_name: str, + workspace: str = "default", + api_key: Optional[str] = None, + api_endpoint: Optional[str] = None, + timeout: int = 60) ``` Undeploys the pipelines of a pipeline config on Deepset Cloud. @@ -285,7 +317,13 @@ Set the component for a node in the Pipeline. 
#### Pipeline.run ```python -def run(query: Optional[str] = None, file_paths: Optional[List[str]] = None, labels: Optional[MultiLabel] = None, documents: Optional[List[Document]] = None, meta: Optional[Union[dict, List[dict]]] = None, params: Optional[dict] = None, debug: Optional[bool] = None) +def run(query: Optional[str] = None, + file_paths: Optional[List[str]] = None, + labels: Optional[MultiLabel] = None, + documents: Optional[List[Document]] = None, + meta: Optional[Union[dict, List[dict]]] = None, + params: Optional[dict] = None, + debug: Optional[bool] = None) ``` Runs the Pipeline, one node at a time. @@ -310,7 +348,15 @@ the Nodes received and the output they generated. You can then find all debug in #### Pipeline.run\_batch ```python -def run_batch(queries: List[str] = None, file_paths: Optional[List[str]] = None, labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None, documents: Optional[Union[List[Document], List[List[Document]]]] = None, meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, params: Optional[dict] = None, debug: Optional[bool] = None) +def run_batch(queries: List[str] = None, + file_paths: Optional[List[str]] = None, + labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None, + documents: Optional[Union[List[Document], + List[List[Document]]]] = None, + meta: Optional[Union[Dict[str, Any], List[Dict[str, + Any]]]] = None, + params: Optional[dict] = None, + debug: Optional[bool] = None) ``` Runs the Pipeline in a batch mode, one node at a time. The batch mode means that the Pipeline can take more than one query as input. You can use this method for query pipelines only. When used with an indexing pipeline, it calls the pipeline `run()` method. @@ -346,7 +392,18 @@ the Nodes received and the output they generated. You can then find all debug in ```python @classmethod -def eval_beir(cls, index_pipeline: Pipeline, query_pipeline: Pipeline, index_params: dict = {}, query_params: dict = {}, dataset: str = "scifact", dataset_dir: Path = Path("."), top_k_values: List[int] = [1, 3, 5, 10, 100, 1000], keep_index: bool = False) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str, float]] +def eval_beir( + cls, + index_pipeline: Pipeline, + query_pipeline: Pipeline, + index_params: dict = {}, + query_params: dict = {}, + dataset: str = "scifact", + dataset_dir: Path = Path("."), + top_k_values: List[int] = [1, 3, 5, 10, 100, 1000], + keep_index: bool = False +) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str, + float]] ``` Runs information retrieval evaluation of a pipeline using BEIR on a specified BEIR dataset. 
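As a sketch of how the reformatted `eval_beir` classmethod is typically called: the index and query pipelines are assumed to be prebuilt, the `beir` package is assumed to be installed, and the unpacking order (ndcg, map, recall, precision) and key format are assumptions based on the four metric dictionaries the signature returns:

```python
from haystack import Pipeline

# index_pipeline and query_pipeline are assumed to be prebuilt Pipeline objects.
ndcg, _map, recall, precision = Pipeline.eval_beir(
    index_pipeline=index_pipeline,
    query_pipeline=query_pipeline,
    dataset="scifact",          # fetched via BEIR
    top_k_values=[1, 10, 100],
)
# Each metric dict holds one score per top_k value, e.g. {"NDCG@1": ..., "NDCG@10": ...}
print(ndcg)
```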
@@ -375,14 +432,38 @@ Each metric is represented by a dictionary containing the scores for each top_k ```python @classmethod -def execute_eval_run(cls, index_pipeline: Pipeline, query_pipeline: Pipeline, evaluation_set_labels: List[MultiLabel], corpus_file_paths: List[str], experiment_name: str, experiment_run_name: str, experiment_tracking_tool: Literal["mlflow", None] = None, experiment_tracking_uri: Optional[str] = None, corpus_file_metas: List[Dict[str, Any]] = None, corpus_meta: Dict[str, Any] = {}, evaluation_set_meta: Dict[str, Any] = {}, pipeline_meta: Dict[str, Any] = {}, index_params: dict = {}, query_params: dict = {}, sas_model_name_or_path: str = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, use_batch_mode: bool = False, add_isolated_node_eval: bool = False, reuse_index: bool = False, custom_document_id_field: Optional[str] = None, document_scope: Literal[ - "document_id", - "context", - "document_id_and_context", - "document_id_or_context", - "answer", - "document_id_or_answer", - ] = "document_id_or_answer", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any", context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0) -> EvaluationResult +def execute_eval_run( + cls, + index_pipeline: Pipeline, + query_pipeline: Pipeline, + evaluation_set_labels: List[MultiLabel], + corpus_file_paths: List[str], + experiment_name: str, + experiment_run_name: str, + experiment_tracking_tool: Literal["mlflow", None] = None, + experiment_tracking_uri: Optional[str] = None, + corpus_file_metas: List[Dict[str, Any]] = None, + corpus_meta: Dict[str, Any] = {}, + evaluation_set_meta: Dict[str, Any] = {}, + pipeline_meta: Dict[str, Any] = {}, + index_params: dict = {}, + query_params: dict = {}, + sas_model_name_or_path: str = None, + sas_batch_size: int = 32, + sas_use_gpu: bool = True, + use_batch_mode: bool = False, + add_isolated_node_eval: bool = False, + reuse_index: bool = False, + custom_document_id_field: Optional[str] = None, + document_scope: Literal[ + "document_id", "context", "document_id_and_context", + "document_id_or_context", "answer", + "document_id_or_answer", ] = "document_id_or_answer", + answer_scope: Literal["any", "context", "document_id", + "document_id_and_context"] = "any", + context_matching_min_length: int = 100, + context_matching_boost_split_overlaps: bool = True, + context_matching_threshold: float = 65.0) -> EvaluationResult ``` Starts an experiment run that first indexes the specified files (forming a corpus) using the index pipeline @@ -510,7 +591,19 @@ Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scori ```python @send_event -def eval(labels: List[MultiLabel], documents: Optional[List[List[Document]]] = None, params: Optional[dict] = None, sas_model_name_or_path: Optional[str] = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0, use_auth_token: Optional[Union[str, bool]] = None) -> EvaluationResult +def eval( + labels: List[MultiLabel], + documents: Optional[List[List[Document]]] = None, + params: Optional[dict] = None, + sas_model_name_or_path: Optional[str] = None, + sas_batch_size: int = 32, + sas_use_gpu: bool = True, + add_isolated_node_eval: bool = False, + custom_document_id_field: 
Optional[str] = None, + context_matching_min_length: int = 100, + context_matching_boost_split_overlaps: bool = True, + context_matching_threshold: float = 65.0, + use_auth_token: Optional[Union[str, bool]] = None) -> EvaluationResult ``` Evaluates the pipeline by running the pipeline once per query in debug mode @@ -576,7 +669,19 @@ https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrai ```python @send_event -def eval_batch(labels: List[MultiLabel], documents: Optional[List[List[Document]]] = None, params: Optional[dict] = None, sas_model_name_or_path: Optional[str] = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0, use_auth_token: Optional[Union[str, bool]] = None) -> EvaluationResult +def eval_batch( + labels: List[MultiLabel], + documents: Optional[List[List[Document]]] = None, + params: Optional[dict] = None, + sas_model_name_or_path: Optional[str] = None, + sas_batch_size: int = 32, + sas_use_gpu: bool = True, + add_isolated_node_eval: bool = False, + custom_document_id_field: Optional[str] = None, + context_matching_min_length: int = 100, + context_matching_boost_split_overlaps: bool = True, + context_matching_threshold: float = 65.0, + use_auth_token: Optional[Union[str, bool]] = None) -> EvaluationResult ``` Evaluates the pipeline by running it in batches in the debug mode @@ -690,7 +795,11 @@ Create a Graphviz visualization of the pipeline. ```python @classmethod -def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True, strict_version_check: bool = False) +def load_from_yaml(cls, + path: Path, + pipeline_name: Optional[str] = None, + overwrite_with_env_variables: bool = True, + strict_version_check: bool = False) ``` Load Pipeline from a YAML file defining the individual components and how they're tied together to form @@ -747,7 +856,11 @@ variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. 
Note that an ```python @classmethod -def load_from_config(cls, pipeline_config: Dict, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True, strict_version_check: bool = False) +def load_from_config(cls, + pipeline_config: Dict, + pipeline_name: Optional[str] = None, + overwrite_with_env_variables: bool = True, + strict_version_check: bool = False) ``` Load Pipeline from a config dict defining the individual components and how they're tied together to form @@ -832,14 +945,19 @@ Returns a configuration for the Pipeline that can be used with `Pipeline.load_fr #### Pipeline.print\_eval\_report ```python -def print_eval_report(eval_result: EvaluationResult, n_wrong_examples: int = 3, metrics_filter: Optional[Dict[str, List[str]]] = None, document_scope: Literal[ - "document_id", - "context", - "document_id_and_context", - "document_id_or_context", - "answer", - "document_id_or_answer", - ] = "document_id_or_answer", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any", wrong_examples_fields: List[str] = ["answer", "context", "document_id"], max_characters_per_field: int = 150) +def print_eval_report(eval_result: EvaluationResult, + n_wrong_examples: int = 3, + metrics_filter: Optional[Dict[str, List[str]]] = None, + document_scope: Literal[ + "document_id", "context", "document_id_and_context", + "document_id_or_context", "answer", + "document_id_or_answer", ] = "document_id_or_answer", + answer_scope: Literal["any", "context", "document_id", + "document_id_and_context"] = "any", + wrong_examples_fields: List[str] = [ + "answer", "context", "document_id" + ], + max_characters_per_field: int = 150) ``` Prints evaluation report containing a metrics funnel and worst queries for further analysis. @@ -892,7 +1010,8 @@ class _HaystackBeirRetrieverAdapter() #### \_HaystackBeirRetrieverAdapter.\_\_init\_\_ ```python -def __init__(index_pipeline: Pipeline, query_pipeline: Pipeline, index_params: dict, query_params: dict) +def __init__(index_pipeline: Pipeline, query_pipeline: Pipeline, + index_params: dict, query_params: dict) ``` Adapter mimicking a BEIR retriever used by BEIR's EvaluateRetrieval class to run BEIR evaluations on Haystack Pipelines. @@ -959,7 +1078,9 @@ YAML definitions of Ray pipelines are validated at load. 
For more information, s #### RayPipeline.\_\_init\_\_ ```python -def __init__(address: str = None, ray_args: Optional[Dict[str, Any]] = None, serve_args: Optional[Dict[str, Any]] = None) +def __init__(address: str = None, + ray_args: Optional[Dict[str, Any]] = None, + serve_args: Optional[Dict[str, Any]] = None) ``` **Arguments**: @@ -974,7 +1095,14 @@ def __init__(address: str = None, ray_args: Optional[Dict[str, Any]] = None, ser ```python @classmethod -def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True, address: Optional[str] = None, strict_version_check: bool = False, ray_args: Optional[Dict[str, Any]] = None, serve_args: Optional[Dict[str, Any]] = None) +def load_from_yaml(cls, + path: Path, + pipeline_name: Optional[str] = None, + overwrite_with_env_variables: bool = True, + address: Optional[str] = None, + strict_version_check: bool = False, + ray_args: Optional[Dict[str, Any]] = None, + serve_args: Optional[Dict[str, Any]] = None) ``` Load Pipeline from a YAML file defining the individual components and how they're tied together to form @@ -1189,7 +1317,10 @@ Save a YAML configuration for the Pipeline that can be used with `Pipeline.load_ ```python @classmethod -def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True) +def load_from_yaml(cls, + path: Path, + pipeline_name: Optional[str] = None, + overwrite_with_env_variables: bool = True) ``` Load Pipeline from a YAML file defining the individual components and how they're tied together to form @@ -1277,7 +1408,16 @@ Instance of DocumentStore or None #### BaseStandardPipeline.eval ```python -def eval(labels: List[MultiLabel], params: Optional[dict] = None, sas_model_name_or_path: Optional[str] = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0) -> EvaluationResult +def eval(labels: List[MultiLabel], + params: Optional[dict] = None, + sas_model_name_or_path: Optional[str] = None, + sas_batch_size: int = 32, + sas_use_gpu: bool = True, + add_isolated_node_eval: bool = False, + custom_document_id_field: Optional[str] = None, + context_matching_min_length: int = 100, + context_matching_boost_split_overlaps: bool = True, + context_matching_threshold: float = 65.0) -> EvaluationResult ``` Evaluates the pipeline by running the pipeline once per query in debug mode @@ -1318,7 +1458,16 @@ Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scori #### BaseStandardPipeline.eval\_batch ```python -def eval_batch(labels: List[MultiLabel], params: Optional[dict] = None, sas_model_name_or_path: Optional[str] = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0) -> EvaluationResult +def eval_batch(labels: List[MultiLabel], + params: Optional[dict] = None, + sas_model_name_or_path: Optional[str] = None, + sas_batch_size: int = 32, + sas_use_gpu: bool = True, + add_isolated_node_eval: bool = False, + custom_document_id_field: Optional[str] = None, + context_matching_min_length: int = 100, + context_matching_boost_split_overlaps: bool = True, + 
context_matching_threshold: float = 65.0) -> EvaluationResult ``` Evaluates the pipeline by running the pipeline once per query in the debug mode @@ -1358,14 +1507,19 @@ To calculate SAS (Semantic Answer Similarity) metrics, specify `sas_model_name_o #### BaseStandardPipeline.print\_eval\_report ```python -def print_eval_report(eval_result: EvaluationResult, n_wrong_examples: int = 3, metrics_filter: Optional[Dict[str, List[str]]] = None, document_scope: Literal[ - "document_id", - "context", - "document_id_and_context", - "document_id_or_context", - "answer", - "document_id_or_answer", - ] = "document_id_or_answer", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any", wrong_examples_fields: List[str] = ["answer", "context", "document_id"], max_characters_per_field: int = 150) +def print_eval_report(eval_result: EvaluationResult, + n_wrong_examples: int = 3, + metrics_filter: Optional[Dict[str, List[str]]] = None, + document_scope: Literal[ + "document_id", "context", "document_id_and_context", + "document_id_or_context", "answer", + "document_id_or_answer", ] = "document_id_or_answer", + answer_scope: Literal["any", "context", "document_id", + "document_id_and_context"] = "any", + wrong_examples_fields: List[str] = [ + "answer", "context", "document_id" + ], + max_characters_per_field: int = 150) ``` Prints evaluation report containing a metrics funnel and worst queries for further analysis. @@ -1410,7 +1564,9 @@ In Question Answering, to enforce that the retrieved document is considered corr #### BaseStandardPipeline.run\_batch ```python -def run_batch(queries: List[str], params: Optional[dict] = None, debug: Optional[bool] = None) +def run_batch(queries: List[str], + params: Optional[dict] = None, + debug: Optional[bool] = None) ``` Run a batch of queries through the pipeline. @@ -1454,7 +1610,9 @@ def __init__(reader: BaseReader, retriever: BaseRetriever) #### ExtractiveQAPipeline.run ```python -def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None) +def run(query: str, + params: Optional[dict] = None, + debug: Optional[bool] = None) ``` **Arguments**: @@ -1495,7 +1653,9 @@ def __init__(retriever: BaseRetriever) #### DocumentSearchPipeline.run ```python -def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None) +def run(query: str, + params: Optional[dict] = None, + debug: Optional[bool] = None) ``` **Arguments**: @@ -1536,7 +1696,9 @@ def __init__(generator: BaseGenerator, retriever: BaseRetriever) #### GenerativeQAPipeline.run ```python -def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None) +def run(query: str, + params: Optional[dict] = None, + debug: Optional[bool] = None) ``` **Arguments**: @@ -1565,7 +1727,9 @@ Pipeline that retrieves documents for a query and then summarizes those document #### SearchSummarizationPipeline.\_\_init\_\_ ```python -def __init__(summarizer: BaseSummarizer, retriever: BaseRetriever, return_in_answer_format: bool = False) +def __init__(summarizer: BaseSummarizer, + retriever: BaseRetriever, + return_in_answer_format: bool = False) ``` **Arguments**: @@ -1581,7 +1745,9 @@ pipeline as a "drop-in replacement" for other QA pipelines. 
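To make the `SearchSummarizationPipeline` constructor above concrete, a minimal sketch using an in-memory store; the retriever and summarizer chosen here are illustrative, not defaults of this class:

```python
from haystack import Document
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TfidfRetriever, TransformersSummarizer
from haystack.pipelines import SearchSummarizationPipeline

document_store = InMemoryDocumentStore()
document_store.write_documents([
    Document(content="Albert Einstein developed the theory of relativity, "
                     "one of the two pillars of modern physics."),
])

retriever = TfidfRetriever(document_store=document_store)
summarizer = TransformersSummarizer(model_name_or_path="facebook/bart-large-cnn")

pipeline = SearchSummarizationPipeline(
    summarizer=summarizer,
    retriever=retriever,
    return_in_answer_format=True,  # mimic QA-style output
)
result = pipeline.run(query="What did Einstein work on?")
```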
#### SearchSummarizationPipeline.run ```python -def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None) +def run(query: str, + params: Optional[dict] = None, + debug: Optional[bool] = None) ``` **Arguments**: @@ -1600,7 +1766,9 @@ by this method under the key "_debug" #### SearchSummarizationPipeline.run\_batch ```python -def run_batch(queries: List[str], params: Optional[dict] = None, debug: Optional[bool] = None) +def run_batch(queries: List[str], + params: Optional[dict] = None, + debug: Optional[bool] = None) ``` Run a batch of queries through the pipeline. @@ -1643,7 +1811,9 @@ def __init__(retriever: BaseRetriever) #### FAQPipeline.run ```python -def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None) +def run(query: str, + params: Optional[dict] = None, + debug: Optional[bool] = None) ``` **Arguments**: @@ -1672,7 +1842,9 @@ Takes an existing search pipeline and adds one "input translation node" after th #### TranslationWrapperPipeline.\_\_init\_\_ ```python -def __init__(input_translator: BaseTranslator, output_translator: BaseTranslator, pipeline: BaseStandardPipeline) +def __init__(input_translator: BaseTranslator, + output_translator: BaseTranslator, + pipeline: BaseStandardPipeline) ``` Wrap a given `pipeline` with the `input_translator` and `output_translator`. diff --git a/docs/_src/api/api/preprocessor.md b/docs/_src/api/api/preprocessor.md index bcdd93534..4a10d0a5a 100644 --- a/docs/_src/api/api/preprocessor.md +++ b/docs/_src/api/api/preprocessor.md @@ -16,7 +16,16 @@ class BasePreProcessor(BaseComponent) ```python @abstractmethod -def process(documents: Union[dict, Document, List[Union[dict, Document]]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, remove_substrings: List[str] = [], split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True, id_hash_keys: Optional[List[str]] = None) -> List[Document] +def process(documents: Union[dict, Document, List[Union[dict, Document]]], + clean_whitespace: Optional[bool] = True, + clean_header_footer: Optional[bool] = False, + clean_empty_lines: Optional[bool] = True, + remove_substrings: List[str] = [], + split_by: Optional[str] = "word", + split_length: Optional[int] = 1000, + split_overlap: Optional[int] = None, + split_respect_sentence_boundary: Optional[bool] = True, + id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` Perform document cleaning and splitting. 
Takes a single Document or a List of Documents as input and returns a @@ -39,7 +48,19 @@ class PreProcessor(BasePreProcessor) #### PreProcessor.\_\_init\_\_ ```python -def __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, remove_substrings: List[str] = [], split_by: str = "word", split_length: int = 200, split_overlap: int = 0, split_respect_sentence_boundary: bool = True, tokenizer_model_folder: Optional[Union[str, Path]] = None, language: str = "en", id_hash_keys: Optional[List[str]] = None, progress_bar: bool = True, add_page_number: bool = False) +def __init__(clean_whitespace: bool = True, + clean_header_footer: bool = False, + clean_empty_lines: bool = True, + remove_substrings: List[str] = [], + split_by: str = "word", + split_length: int = 200, + split_overlap: int = 0, + split_respect_sentence_boundary: bool = True, + tokenizer_model_folder: Optional[Union[str, Path]] = None, + language: str = "en", + id_hash_keys: Optional[List[str]] = None, + progress_bar: bool = True, + add_page_number: bool = False) ``` **Arguments**: @@ -80,7 +101,16 @@ in between pages by `PDFToTextConverter`, `TikaConverter`, `ParsrConverter` and #### PreProcessor.process ```python -def process(documents: Union[dict, Document, List[Union[dict, Document]]], clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, remove_substrings: List[str] = [], split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document] +def process(documents: Union[dict, Document, List[Union[dict, Document]]], + clean_whitespace: Optional[bool] = None, + clean_header_footer: Optional[bool] = None, + clean_empty_lines: Optional[bool] = None, + remove_substrings: List[str] = [], + split_by: Optional[str] = None, + split_length: Optional[int] = None, + split_overlap: Optional[int] = None, + split_respect_sentence_boundary: Optional[bool] = None, + id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` Perform document cleaning and splitting. Can take a single document or a list of documents as input and returns a list of documents. @@ -90,7 +120,12 @@ Perform document cleaning and splitting. Can take a single document or a list of #### PreProcessor.clean ```python -def clean(document: Union[dict, Document], clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool, remove_substrings: List[str], id_hash_keys: Optional[List[str]] = None) -> Document +def clean(document: Union[dict, Document], + clean_whitespace: bool, + clean_header_footer: bool, + clean_empty_lines: bool, + remove_substrings: List[str], + id_hash_keys: Optional[List[str]] = None) -> Document ``` Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers @@ -101,7 +136,12 @@ and empty lines. 
Its exact functionality is defined by the parameters passed int #### PreProcessor.split ```python -def split(document: Union[dict, Document], split_by: str, split_length: int, split_overlap: int, split_respect_sentence_boundary: bool, id_hash_keys: Optional[List[str]] = None) -> List[Document] +def split(document: Union[dict, Document], + split_by: str, + split_length: int, + split_overlap: int, + split_respect_sentence_boundary: bool, + id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` Perform document splitting on a single document. This method can split on different units, at different lengths, diff --git a/docs/_src/api/api/primitives.md b/docs/_src/api/api/primitives.md index 8be8ee27e..28270f6dc 100644 --- a/docs/_src/api/api/primitives.md +++ b/docs/_src/api/api/primitives.md @@ -16,7 +16,13 @@ class Document() #### Document.\_\_init\_\_ ```python -def __init__(content: Union[str, pd.DataFrame], content_type: Literal["text", "table", "image", "audio"] = "text", id: Optional[str] = None, score: Optional[float] = None, meta: Dict[str, Any] = None, embedding: Optional[np.ndarray] = None, id_hash_keys: Optional[List[str]] = None) +def __init__(content: Union[str, pd.DataFrame], + content_type: Literal["text", "table", "image", "audio"] = "text", + id: Optional[str] = None, + score: Optional[float] = None, + meta: Dict[str, Any] = None, + embedding: Optional[np.ndarray] = None, + id_hash_keys: Optional[List[str]] = None) ``` One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack. @@ -79,7 +85,10 @@ dict with content of the Document ```python @classmethod -def from_dict(cls, dict: Dict[str, Any], field_map: Dict[str, Any] = {}, id_hash_keys: Optional[List[str]] = None) -> Document +def from_dict(cls, + dict: Dict[str, Any], + field_map: Dict[str, Any] = {}, + id_hash_keys: Optional[List[str]] = None) -> Document ``` Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the @@ -229,7 +238,19 @@ class Label() #### Label.\_\_init\_\_ ```python -def __init__(query: str, document: Document, is_correct_answer: bool, is_correct_document: bool, origin: Literal["user-feedback", "gold-label"], answer: Optional[Answer], id: Optional[str] = None, no_answer: Optional[bool] = None, pipeline_id: Optional[str] = None, created_at: Optional[str] = None, updated_at: Optional[str] = None, meta: Optional[dict] = None, filters: Optional[dict] = None) +def __init__(query: str, + document: Document, + is_correct_answer: bool, + is_correct_document: bool, + origin: Literal["user-feedback", "gold-label"], + answer: Optional[Answer], + id: Optional[str] = None, + no_answer: Optional[bool] = None, + pipeline_id: Optional[str] = None, + created_at: Optional[str] = None, + updated_at: Optional[str] = None, + meta: Optional[dict] = None, + filters: Optional[dict] = None) ``` Object used to represent label/feedback in a standardized way within Haystack. @@ -272,7 +293,10 @@ class MultiLabel() #### MultiLabel.\_\_init\_\_ ```python -def __init__(labels: List[Label], drop_negative_labels=False, drop_no_answers=False, **kwargs) +def __init__(labels: List[Label], + drop_negative_labels=False, + drop_no_answers=False, + **kwargs) ``` There are often multiple `Labels` associated with a single query. 
For example, there can be multiple annotated @@ -382,14 +406,17 @@ The DataFrames have the following schema: #### EvaluationResult.calculate\_metrics ```python -def calculate_metrics(simulated_top_k_reader: int = -1, simulated_top_k_retriever: int = -1, document_scope: Literal[ - "document_id", - "context", - "document_id_and_context", - "document_id_or_context", - "answer", - "document_id_or_answer", - ] = "document_id_or_answer", eval_mode: Literal["integrated", "isolated"] = "integrated", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any") -> Dict[str, Dict[str, float]] +def calculate_metrics( + simulated_top_k_reader: int = -1, + simulated_top_k_retriever: int = -1, + document_scope: Literal[ + "document_id", "context", "document_id_and_context", + "document_id_or_context", "answer", + "document_id_or_answer", ] = "document_id_or_answer", + eval_mode: Literal["integrated", "isolated"] = "integrated", + answer_scope: Literal["any", "context", "document_id", + "document_id_and_context"] = "any" +) -> Dict[str, Dict[str, float]] ``` Calculates proper metrics for each node. @@ -457,14 +484,23 @@ In Question Answering, to enforce that the retrieved document is considered corr #### EvaluationResult.wrong\_examples ```python -def wrong_examples(node: str, n: int = 3, simulated_top_k_reader: int = -1, simulated_top_k_retriever: int = -1, document_scope: Literal[ - "document_id", - "context", - "document_id_and_context", - "document_id_or_context", - "answer", - "document_id_or_answer", - ] = "document_id_or_answer", document_metric: str = "recall_single_hit", answer_metric: str = "f1", document_metric_threshold: float = 0.5, answer_metric_threshold: float = 0.5, eval_mode: Literal["integrated", "isolated"] = "integrated", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any") -> List[Dict] +def wrong_examples( + node: str, + n: int = 3, + simulated_top_k_reader: int = -1, + simulated_top_k_retriever: int = -1, + document_scope: Literal[ + "document_id", "context", "document_id_and_context", + "document_id_or_context", "answer", + "document_id_or_answer", ] = "document_id_or_answer", + document_metric: str = "recall_single_hit", + answer_metric: str = "f1", + document_metric_threshold: float = 0.5, + answer_metric_threshold: float = 0.5, + eval_mode: Literal["integrated", "isolated"] = "integrated", + answer_scope: Literal["any", "context", "document_id", + "document_id_and_context"] = "any" +) -> List[Dict] ``` Returns the worst performing queries. 
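As a short sketch of how the two signatures above are typically used together: `eval_result` is assumed to come from an earlier `Pipeline.eval(...)` call, and the node names ("Retriever", "Reader") depend on your pipeline:

```python
# eval_result is assumed to be the EvaluationResult of a previous pipeline.eval(...) call.
metrics = eval_result.calculate_metrics(document_scope="document_id_or_answer")
print(metrics["Retriever"]["recall_single_hit"])  # one nested dict of scores per node

# Inspect the three worst-performing queries of the Reader node, judged by F1.
failures = eval_result.wrong_examples(node="Reader", n=3, answer_metric="f1")
```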
diff --git a/docs/_src/api/api/pseudo_label_generator.md b/docs/_src/api/api/pseudo_label_generator.md index 757f14b5f..d0d62e807 100644 --- a/docs/_src/api/api/pseudo_label_generator.md +++ b/docs/_src/api/api/pseudo_label_generator.md @@ -53,7 +53,17 @@ For example: #### PseudoLabelGenerator.\_\_init\_\_ ```python -def __init__(question_producer: Union[QuestionGenerator, List[Dict[str, str]]], retriever: BaseRetriever, cross_encoder_model_name_or_path: str = "cross-encoder/ms-marco-MiniLM-L-6-v2", max_questions_per_document: int = 3, top_k: int = 50, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None) +def __init__(question_producer: Union[QuestionGenerator, List[Dict[str, str]]], + retriever: BaseRetriever, + cross_encoder_model_name_or_path: + str = "cross-encoder/ms-marco-MiniLM-L-6-v2", + max_questions_per_document: int = 3, + top_k: int = 50, + batch_size: int = 16, + progress_bar: bool = True, + use_auth_token: Optional[Union[str, bool]] = None, + use_gpu: bool = True, + devices: Optional[List[Union[str, torch.device]]] = None) ``` Loads the cross-encoder model and prepares PseudoLabelGenerator. @@ -84,7 +94,9 @@ parameter is not used and a single cpu device is used for inference. #### PseudoLabelGenerator.generate\_questions ```python -def generate_questions(documents: List[Document], batch_size: Optional[int] = None) -> List[Dict[str, str]] +def generate_questions( + documents: List[Document], + batch_size: Optional[int] = None) -> List[Dict[str, str]] ``` It takes a list of documents and generates a list of question-document pairs. @@ -103,7 +115,8 @@ A list of question-document pairs. #### PseudoLabelGenerator.mine\_negatives ```python -def mine_negatives(question_doc_pairs: List[Dict[str, str]], batch_size: Optional[int] = None) -> List[Dict[str, str]] +def mine_negatives(question_doc_pairs: List[Dict[str, str]], + batch_size: Optional[int] = None) -> List[Dict[str, str]] ``` Given a list of question and positive document pairs, this function returns a list of question/positive document/negative document @@ -125,7 +138,8 @@ and negative document. #### PseudoLabelGenerator.generate\_margin\_scores ```python -def generate_margin_scores(mined_negatives: List[Dict[str, str]], batch_size: Optional[int] = None) -> List[Dict] +def generate_margin_scores(mined_negatives: List[Dict[str, str]], + batch_size: Optional[int] = None) -> List[Dict] ``` Given a list of mined negatives, this function predicts the score margin between the positive and negative document using @@ -157,7 +171,9 @@ A list of dictionaries, each of which has the following keys: #### PseudoLabelGenerator.generate\_pseudo\_labels ```python -def generate_pseudo_labels(documents: List[Document], batch_size: Optional[int] = None) -> Tuple[dict, str] +def generate_pseudo_labels( + documents: List[Document], + batch_size: Optional[int] = None) -> Tuple[dict, str] ``` Given a list of documents, this function generates a list of question-document pairs, mines for negatives, and diff --git a/docs/_src/api/api/query_classifier.md b/docs/_src/api/api/query_classifier.md index 45df8f2c2..de7938418 100644 --- a/docs/_src/api/api/query_classifier.md +++ b/docs/_src/api/api/query_classifier.md @@ -69,11 +69,17 @@ and the further processing can be customized. 
You can define this by connecting #### SklearnQueryClassifier.\_\_init\_\_ ```python -def __init__(model_name_or_path: Union[ - str, Any - ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle", vectorizer_name_or_path: Union[ - str, Any - ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle", batch_size: Optional[int] = None, progress_bar: bool = True) +def __init__( + model_name_or_path: + Union[ + str, + Any] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle", + vectorizer_name_or_path: + Union[ + str, + Any] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle", + batch_size: Optional[int] = None, + progress_bar: bool = True) ``` **Arguments**: @@ -144,7 +150,17 @@ This node also supports zero-shot-classification. #### TransformersQueryClassifier.\_\_init\_\_ ```python -def __init__(model_name_or_path: Union[Path, str] = "shahrukhx01/bert-mini-finetune-question-detection", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, task: str = "text-classification", labels: List[str] = DEFAULT_LABELS, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None) +def __init__(model_name_or_path: Union[ + Path, str] = "shahrukhx01/bert-mini-finetune-question-detection", + model_version: Optional[str] = None, + tokenizer: Optional[str] = None, + use_gpu: bool = True, + task: str = "text-classification", + labels: List[str] = DEFAULT_LABELS, + batch_size: int = 16, + progress_bar: bool = True, + use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None) ``` **Arguments**: diff --git a/docs/_src/api/api/question_generator.md b/docs/_src/api/api/question_generator.md index c5bfc3236..becdfa740 100644 --- a/docs/_src/api/api/question_generator.md +++ b/docs/_src/api/api/question_generator.md @@ -23,7 +23,23 @@ come from earlier in the document. #### QuestionGenerator.\_\_init\_\_ ```python -def __init__(model_name_or_path="valhalla/t5-base-e2e-qg", model_version=None, num_beams=4, max_length=256, no_repeat_ngram_size=3, length_penalty=1.5, early_stopping=True, split_length=50, split_overlap=10, use_gpu=True, prompt="generate questions:", num_queries_per_doc=1, sep_token: str = "", batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None) +def __init__(model_name_or_path="valhalla/t5-base-e2e-qg", + model_version=None, + num_beams=4, + max_length=256, + no_repeat_ngram_size=3, + length_penalty=1.5, + early_stopping=True, + split_length=50, + split_overlap=10, + use_gpu=True, + prompt="generate questions:", + num_queries_per_doc=1, + sep_token: str = "", + batch_size: int = 16, + progress_bar: bool = True, + use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None) ``` Uses the valhalla/t5-base-e2e-qg model by default. This class supports any question generation model that is @@ -55,7 +71,10 @@ parameter is not used and a single cpu device is used for inference. 
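For orientation, a minimal sketch of the default question generator on a single passage; the single-text `generate()` helper used here is assumed to exist alongside the `generate_batch()` documented next, and the input text is illustrative:

```python
from haystack.nodes import QuestionGenerator

generator = QuestionGenerator()  # defaults to valhalla/t5-base-e2e-qg

text = (
    "The Eiffel Tower is a wrought-iron lattice tower on the "
    "Champ de Mars in Paris, built from 1887 to 1889."
)
questions = generator.generate(text)  # a list of generated question strings
```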
#### QuestionGenerator.generate\_batch ```python -def generate_batch(texts: Union[List[str], List[List[str]]], batch_size: Optional[int] = None) -> Union[List[List[str]], List[List[List[str]]]] +def generate_batch( + texts: Union[List[str], List[List[str]]], + batch_size: Optional[int] = None +) -> Union[List[List[str]], List[List[List[str]]]] ``` Generates questions for a list of strings or a list of lists of strings. diff --git a/docs/_src/api/api/ranker.md b/docs/_src/api/api/ranker.md index 41e1788b0..0161b1736 100644 --- a/docs/_src/api/api/ranker.md +++ b/docs/_src/api/api/ranker.md @@ -25,7 +25,12 @@ Wrapper method used to time functions. #### BaseRanker.eval ```python -def eval(label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label", top_k: int = 10, open_domain: bool = False, return_preds: bool = False) -> dict +def eval(label_index: str = "label", + doc_index: str = "eval_document", + label_origin: str = "gold_label", + top_k: int = 10, + open_domain: bool = False, + return_preds: bool = False) -> dict ``` Performs evaluation of the Ranker. @@ -94,7 +99,15 @@ Usage example: #### SentenceTransformersRanker.\_\_init\_\_ ```python -def __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, top_k: int = 10, use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, batch_size: int = 16, scale_score: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None) +def __init__(model_name_or_path: Union[str, Path], + model_version: Optional[str] = None, + top_k: int = 10, + use_gpu: bool = True, + devices: Optional[List[Union[str, torch.device]]] = None, + batch_size: int = 16, + scale_score: bool = True, + progress_bar: bool = True, + use_auth_token: Optional[Union[str, bool]] = None) ``` **Arguments**: @@ -125,7 +138,9 @@ parameter is not used and a single cpu device is used for inference. #### SentenceTransformersRanker.predict ```python -def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> List[Document] +def predict(query: str, + documents: List[Document], + top_k: Optional[int] = None) -> List[Document] ``` Use loaded ranker model to re-rank the supplied list of Document. @@ -147,7 +162,12 @@ List of Document #### SentenceTransformersRanker.predict\_batch ```python -def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None) -> Union[List[Document], List[List[Document]]] +def predict_batch( + queries: List[str], + documents: Union[List[Document], List[List[Document]]], + top_k: Optional[int] = None, + batch_size: Optional[int] = None +) -> Union[List[Document], List[List[Document]]] ``` Use loaded ranker model to re-rank the supplied lists of Documents. 
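A minimal re-ranking sketch matching the `predict`/`predict_batch` signatures above; the cross-encoder model name is the usual example choice rather than a requirement:

```python
from haystack import Document
from haystack.nodes import SentenceTransformersRanker

ranker = SentenceTransformersRanker(
    model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2"
)

docs = [
    Document(content="Berlin is the capital of Germany."),
    Document(content="The capital of France is Paris."),
]
# Re-rank the candidates and keep only the best match.
reranked = ranker.predict(query="What is the capital of Germany?",
                          documents=docs, top_k=1)
```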
diff --git a/docs/_src/api/api/reader.md b/docs/_src/api/api/reader.md index a41b027b7..f142e8d75 100644 --- a/docs/_src/api/api/reader.md +++ b/docs/_src/api/api/reader.md @@ -45,7 +45,28 @@ While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interf #### FARMReader.\_\_init\_\_ ```python -def __init__(model_name_or_path: str, model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0, use_confidence_scores: bool = True, confidence_threshold: Optional[float] = None, proxies: Optional[Dict[str, str]] = None, local_files_only=False, force_download=False, use_auth_token: Optional[Union[str, bool]] = None) +def __init__(model_name_or_path: str, + model_version: Optional[str] = None, + context_window_size: int = 150, + batch_size: int = 50, + use_gpu: bool = True, + devices: Optional[List[Union[str, torch.device]]] = None, + no_ans_boost: float = 0.0, + return_no_answer: bool = False, + top_k: int = 10, + top_k_per_candidate: int = 3, + top_k_per_sample: int = 1, + num_processes: Optional[int] = None, + max_seq_len: int = 256, + doc_stride: int = 128, + progress_bar: bool = True, + duplicate_filtering: int = 0, + use_confidence_scores: bool = True, + confidence_threshold: Optional[float] = None, + proxies: Optional[Dict[str, str]] = None, + local_files_only=False, + force_download=False, + use_auth_token: Optional[Union[str, bool]] = None) ``` **Arguments**: @@ -113,7 +134,29 @@ https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrai #### FARMReader.train ```python -def train(data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], batch_size: int = 10, n_epochs: int = 2, learning_rate: float = 1e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), grad_acc_steps: int = 1, early_stopping: Optional[EarlyStopping] = None) +def train(data_dir: str, + train_filename: str, + dev_filename: Optional[str] = None, + test_filename: Optional[str] = None, + use_gpu: Optional[bool] = None, + devices: List[torch.device] = [], + batch_size: int = 10, + n_epochs: int = 2, + learning_rate: float = 1e-5, + max_seq_len: Optional[int] = None, + warmup_proportion: float = 0.2, + dev_split: float = 0, + evaluate_every: int = 300, + save_dir: Optional[str] = None, + num_processes: Optional[int] = None, + use_amp: str = None, + checkpoint_root_dir: Path = Path("model_checkpoints"), + checkpoint_every: Optional[int] = None, + checkpoints_to_keep: int = 3, + caching: bool = False, + cache_path: Path = Path("cache/data_silo"), + grad_acc_steps: int = 1, + early_stopping: Optional[EarlyStopping] = None) ``` Fine-tune a model on a QA dataset. 
Options: @@ -176,7 +219,36 @@ None #### FARMReader.distil\_prediction\_layer\_from ```python -def distil_prediction_layer_from(teacher_model: "FARMReader", data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], student_batch_size: int = 10, teacher_batch_size: Optional[int] = None, n_epochs: int = 2, learning_rate: float = 3e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), distillation_loss_weight: float = 0.5, distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = "kl_div", temperature: float = 1.0, grad_acc_steps: int = 1, early_stopping: Optional[EarlyStopping] = None) +def distil_prediction_layer_from( + teacher_model: "FARMReader", + data_dir: str, + train_filename: str, + dev_filename: Optional[str] = None, + test_filename: Optional[str] = None, + use_gpu: Optional[bool] = None, + devices: List[torch.device] = [], + student_batch_size: int = 10, + teacher_batch_size: Optional[int] = None, + n_epochs: int = 2, + learning_rate: float = 3e-5, + max_seq_len: Optional[int] = None, + warmup_proportion: float = 0.2, + dev_split: float = 0, + evaluate_every: int = 300, + save_dir: Optional[str] = None, + num_processes: Optional[int] = None, + use_amp: str = None, + checkpoint_root_dir: Path = Path("model_checkpoints"), + checkpoint_every: Optional[int] = None, + checkpoints_to_keep: int = 3, + caching: bool = False, + cache_path: Path = Path("cache/data_silo"), + distillation_loss_weight: float = 0.5, + distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], + torch.Tensor]] = "kl_div", + temperature: float = 1.0, + grad_acc_steps: int = 1, + early_stopping: Optional[EarlyStopping] = None) ``` Fine-tune a model on a QA dataset using logit-based distillation. 
You need to provide a teacher model that is already finetuned on the dataset @@ -258,7 +330,35 @@ None #### FARMReader.distil\_intermediate\_layers\_from ```python -def distil_intermediate_layers_from(teacher_model: "FARMReader", data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], batch_size: int = 10, n_epochs: int = 5, learning_rate: float = 5e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = "mse", temperature: float = 1.0, processor: Optional[Processor] = None, grad_acc_steps: int = 1, early_stopping: Optional[EarlyStopping] = None) +def distil_intermediate_layers_from( + teacher_model: "FARMReader", + data_dir: str, + train_filename: str, + dev_filename: Optional[str] = None, + test_filename: Optional[str] = None, + use_gpu: Optional[bool] = None, + devices: List[torch.device] = [], + batch_size: int = 10, + n_epochs: int = 5, + learning_rate: float = 5e-5, + max_seq_len: Optional[int] = None, + warmup_proportion: float = 0.2, + dev_split: float = 0, + evaluate_every: int = 300, + save_dir: Optional[str] = None, + num_processes: Optional[int] = None, + use_amp: str = None, + checkpoint_root_dir: Path = Path("model_checkpoints"), + checkpoint_every: Optional[int] = None, + checkpoints_to_keep: int = 3, + caching: bool = False, + cache_path: Path = Path("cache/data_silo"), + distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], + torch.Tensor]] = "mse", + temperature: float = 1.0, + processor: Optional[Processor] = None, + grad_acc_steps: int = 1, + early_stopping: Optional[EarlyStopping] = None) ``` The first stage of distillation finetuning as described in the TinyBERT paper: @@ -332,7 +432,11 @@ None #### FARMReader.update\_parameters ```python -def update_parameters(context_window_size: Optional[int] = None, no_ans_boost: Optional[float] = None, return_no_answer: Optional[bool] = None, max_seq_len: Optional[int] = None, doc_stride: Optional[int] = None) +def update_parameters(context_window_size: Optional[int] = None, + no_ans_boost: Optional[float] = None, + return_no_answer: Optional[bool] = None, + max_seq_len: Optional[int] = None, + doc_stride: Optional[int] = None) ``` Hot update parameters of a loaded Reader. It may not to be safe when processing concurrent requests. @@ -356,7 +460,9 @@ Saves the Reader model so that it can be reused at a later point in time. #### FARMReader.save\_to\_remote ```python -def save_to_remote(repo_id: str, private: Optional[bool] = None, commit_message: str = "Add new model to Hugging Face.") +def save_to_remote(repo_id: str, + private: Optional[bool] = None, + commit_message: str = "Add new model to Hugging Face.") ``` Saves the Reader model to Hugging Face Model Hub with the given model_name. For this to work: @@ -375,7 +481,10 @@ Saves the Reader model to Hugging Face Model Hub with the given model_name. 
For #### FARMReader.predict\_batch ```python -def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None) +def predict_batch(queries: List[str], + documents: Union[List[Document], List[List[Document]]], + top_k: Optional[int] = None, + batch_size: Optional[int] = None) ``` Use loaded QA model to find answers for the queries in the Documents. @@ -405,7 +514,9 @@ Can be a single list of Documents or a list of lists of Documents. #### FARMReader.predict ```python -def predict(query: str, documents: List[Document], top_k: Optional[int] = None) +def predict(query: str, + documents: List[Document], + top_k: Optional[int] = None) ``` Use loaded QA model to find answers for a query in the supplied list of Document. @@ -442,7 +553,10 @@ Dict containing query and answers #### FARMReader.eval\_on\_file ```python -def eval_on_file(data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None, calibrate_conf_scores: bool = False) +def eval_on_file(data_dir: Union[Path, str], + test_filename: str, + device: Optional[Union[str, torch.device]] = None, + calibrate_conf_scores: bool = False) ``` Performs evaluation on a SQuAD-formatted file. @@ -466,7 +580,12 @@ or use the Reader's device by default. #### FARMReader.eval ```python -def eval(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", calibrate_conf_scores: bool = False) +def eval(document_store: BaseDocumentStore, + device: Optional[Union[str, torch.device]] = None, + label_index: str = "label", + doc_index: str = "eval_document", + label_origin: str = "gold-label", + calibrate_conf_scores: bool = False) ``` Performs evaluation on evaluation documents in the DocumentStore. @@ -492,7 +611,12 @@ or use the Reader's device by default. #### FARMReader.calibrate\_confidence\_scores ```python -def calibrate_confidence_scores(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label") +def calibrate_confidence_scores(document_store: BaseDocumentStore, + device: Optional[Union[str, + torch.device]] = None, + label_index: str = "label", + doc_index: str = "eval_document", + label_origin: str = "gold_label") ``` Calibrates confidence scores on evaluation documents in the DocumentStore. @@ -512,7 +636,9 @@ or use the Reader's device by default. #### FARMReader.predict\_on\_texts ```python -def predict_on_texts(question: str, texts: List[str], top_k: Optional[int] = None) +def predict_on_texts(question: str, + texts: List[str], + top_k: Optional[int] = None) ``` Use loaded QA model to find answers for a question in the supplied list of Document. @@ -550,7 +676,13 @@ Dict containing question and answers ```python @classmethod -def convert_to_onnx(cls, model_name: str, output_path: Path, convert_to_float16: bool = False, quantize: bool = False, task_type: str = "question_answering", opset_version: int = 11) +def convert_to_onnx(cls, + model_name: str, + output_path: Path, + convert_to_float16: bool = False, + quantize: bool = False, + task_type: str = "question_answering", + opset_version: int = 11) ``` Convert a PyTorch BERT model to ONNX format and write to ./onnx-export dir. 
The converted ONNX model @@ -598,7 +730,20 @@ With this reader, you can directly get predictions via predict() #### TransformersReader.\_\_init\_\_ ```python -def __init__(model_name_or_path: str = "distilbert-base-uncased-distilled-squad", model_version: Optional[str] = None, tokenizer: Optional[str] = None, context_window_size: int = 70, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 3, return_no_answers: bool = False, max_seq_len: int = 256, doc_stride: int = 128, batch_size: int = 16, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None) +def __init__( + model_name_or_path: str = "distilbert-base-uncased-distilled-squad", + model_version: Optional[str] = None, + tokenizer: Optional[str] = None, + context_window_size: int = 70, + use_gpu: bool = True, + top_k: int = 10, + top_k_per_candidate: int = 3, + return_no_answers: bool = False, + max_seq_len: int = 256, + doc_stride: int = 128, + batch_size: int = 16, + use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None) ``` Load a QA model from Transformers. @@ -647,7 +792,9 @@ parameter is not used and a single cpu device is used for inference. #### TransformersReader.predict ```python -def predict(query: str, documents: List[Document], top_k: Optional[int] = None) +def predict(query: str, + documents: List[Document], + top_k: Optional[int] = None) ``` Use loaded QA model to find answers for a query in the supplied list of Document. @@ -685,7 +832,10 @@ Dict containing query and answers #### TransformersReader.predict\_batch ```python -def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None) +def predict_batch(queries: List[str], + documents: Union[List[Document], List[List[Document]]], + top_k: Optional[int] = None, + batch_size: Optional[int] = None) ``` Use loaded QA model to find answers for the queries in the Documents. @@ -752,7 +902,16 @@ answer = prediction["answers"][0].answer # "10 june 1996" #### TableReader.\_\_init\_\_ ```python -def __init__(model_name_or_path: str = "google/tapas-base-finetuned-wtq", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 3, return_no_answer: bool = False, max_seq_len: int = 256, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None) +def __init__(model_name_or_path: str = "google/tapas-base-finetuned-wtq", + model_version: Optional[str] = None, + tokenizer: Optional[str] = None, + use_gpu: bool = True, + top_k: int = 10, + top_k_per_candidate: int = 3, + return_no_answer: bool = False, + max_seq_len: int = 256, + use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None) ``` Load a TableQA model from Transformers. @@ -803,7 +962,9 @@ parameter is not used and a single cpu device is used for inference. 
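Since tables reach the reader as `Document` objects wrapping a pandas `DataFrame`, a short construction sketch may help (the table values are invented):

```python
import pandas as pd
from haystack import Document
from haystack.nodes import TableReader

reader = TableReader(model_name_or_path="google/tapas-base-finetuned-wtq")

# Tables are passed as Documents whose content is a DataFrame
table = pd.DataFrame({"Actor": ["Brad Pitt", "Leonardo DiCaprio"], "Age": ["58", "47"]})
document = Document(content=table, content_type="table")

prediction = reader.predict(query="How old is Leonardo DiCaprio?", documents=[document])
print(prediction["answers"][0].answer)
```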
#### TableReader.predict ```python -def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict +def predict(query: str, + documents: List[Document], + top_k: Optional[int] = None) -> Dict ``` Use loaded TableQA model to find answers for a query in the supplied list of Documents @@ -830,7 +991,10 @@ Dict containing query and answers #### TableReader.predict\_batch ```python -def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None) +def predict_batch(queries: List[str], + documents: Union[List[Document], List[List[Document]]], + top_k: Optional[int] = None, + batch_size: Optional[int] = None) ``` Use loaded TableQA model to find answers for the supplied queries in the supplied Documents @@ -890,7 +1054,18 @@ Pros and Cons of RCIReader compared to TableReader: #### RCIReader.\_\_init\_\_ ```python -def __init__(row_model_name_or_path: str = "michaelrglass/albert-base-rci-wikisql-row", column_model_name_or_path: str = "michaelrglass/albert-base-rci-wikisql-col", row_model_version: Optional[str] = None, column_model_version: Optional[str] = None, row_tokenizer: Optional[str] = None, column_tokenizer: Optional[str] = None, use_gpu: bool = True, top_k: int = 10, max_seq_len: int = 256, use_auth_token: Optional[Union[str, bool]] = None) +def __init__(row_model_name_or_path: + str = "michaelrglass/albert-base-rci-wikisql-row", + column_model_name_or_path: + str = "michaelrglass/albert-base-rci-wikisql-col", + row_model_version: Optional[str] = None, + column_model_version: Optional[str] = None, + row_tokenizer: Optional[str] = None, + column_tokenizer: Optional[str] = None, + use_gpu: bool = True, + top_k: int = 10, + max_seq_len: int = 256, + use_auth_token: Optional[Union[str, bool]] = None) ``` Load an RCI model from Transformers. @@ -926,7 +1101,9 @@ https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrai #### RCIReader.predict ```python -def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict +def predict(query: str, + documents: List[Document], + top_k: Optional[int] = None) -> Dict ``` Use loaded RCI models to find answers for a query in the supplied list of Documents diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md index 318e35940..ad6078caa 100644 --- a/docs/_src/api/api/retriever.md +++ b/docs/_src/api/api/retriever.md @@ -28,7 +28,13 @@ Base class for regular retrievers. ```python @abstractmethod -def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] +def retrieve(query: str, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, + bool]]] = None, + top_k: Optional[int] = None, + index: str = None, + headers: Optional[Dict[str, str]] = None, + scale_score: bool = None) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -61,7 +67,13 @@ Wrapper method used to time functions. 
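As a sketch of the retrieval calls above, assuming an Elasticsearch-backed store; the metadata field names are invented, and the comparison-operator filter syntax depends on the document store in use:

```python
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import BM25Retriever

document_store = ElasticsearchDocumentStore(host="localhost", index="document")
retriever = BM25Retriever(document_store=document_store)

docs = retriever.retrieve(
    query="What did Einstein work on?",
    filters={"category": ["science"], "year": {"$gte": 2015}},  # field names invented
    top_k=5,
)

# The batched variant returns one ranked List[Document] per query, in input order
batched = retriever.retrieve_batch(queries=["BM25", "dense retrieval"], top_k=3)
```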
#### BaseRetriever.eval ```python -def eval(label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", top_k: int = 10, open_domain: bool = False, return_preds: bool = False, headers: Optional[Dict[str, str]] = None) -> dict +def eval(label_index: str = "label", + doc_index: str = "eval_document", + label_origin: str = "gold-label", + top_k: int = 10, + open_domain: bool = False, + return_preds: bool = False, + headers: Optional[Dict[str, str]] = None) -> dict ``` Performs evaluation on the Retriever. @@ -110,7 +122,11 @@ class BM25Retriever(BaseRetriever) #### BM25Retriever.\_\_init\_\_ ```python -def __init__(document_store: KeywordDocumentStore, top_k: int = 10, all_terms_must_match: bool = False, custom_query: Optional[str] = None, scale_score: bool = True) +def __init__(document_store: KeywordDocumentStore, + top_k: int = 10, + all_terms_must_match: bool = False, + custom_query: Optional[str] = None, + scale_score: bool = True) ``` **Arguments**: @@ -194,7 +210,13 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. #### BM25Retriever.retrieve ```python -def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] +def retrieve(query: str, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, + bool]]] = None, + top_k: Optional[int] = None, + index: str = None, + headers: Optional[Dict[str, str]] = None, + scale_score: bool = None) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -280,12 +302,18 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. #### BM25Retriever.retrieve\_batch ```python -def retrieve_batch(queries: List[str], filters: Optional[ - Union[ - Dict[str, Union[Dict, List, str, int, float, bool]], - List[Dict[str, Union[Dict, List, str, int, float, bool]]], - ] - ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] +def retrieve_batch(queries: List[str], + filters: Optional[Union[Dict[str, Union[Dict, List, str, + int, float, bool]], + List[Dict[str, + Union[Dict, List, str, + int, float, + bool]]], ]] = None, + top_k: Optional[int] = None, + index: str = None, + headers: Optional[Dict[str, str]] = None, + batch_size: Optional[int] = None, + scale_score: bool = None) -> List[List[Document]] ``` Scan through documents in DocumentStore and return a small number documents @@ -386,7 +414,12 @@ Helpful for benchmarking, testing and if you want to do QA on small documents wi #### FilterRetriever.retrieve ```python -def retrieve(query: str, filters: dict = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] +def retrieve(query: str, + filters: dict = None, + top_k: Optional[int] = None, + index: str = None, + headers: Optional[Dict[str, str]] = None, + scale_score: bool = None) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -425,7 +458,9 @@ It uses sklearn's TfidfVectorizer to compute a tf-idf matrix. 
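A self-contained sketch (the documents are invented; with `auto_fit` left at its default, the tf-idf matrix is built from the store's contents):

```python
from haystack import Document
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TfidfRetriever

document_store = InMemoryDocumentStore()
document_store.write_documents([
    Document(content="Goethe wrote Faust."),
    Document(content="The Louvre is located in Paris."),
])

retriever = TfidfRetriever(document_store=document_store)  # fits the tf-idf matrix on init
docs = retriever.retrieve(query="Who wrote Faust?", top_k=1)
```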
#### TfidfRetriever.\_\_init\_\_ ```python -def __init__(document_store: BaseDocumentStore, top_k: int = 10, auto_fit=True) +def __init__(document_store: BaseDocumentStore, + top_k: int = 10, + auto_fit=True) ``` **Arguments**: @@ -439,12 +474,16 @@ def __init__(document_store: BaseDocumentStore, top_k: int = 10, auto_fit=True) #### TfidfRetriever.retrieve ```python -def retrieve(query: str, filters: Optional[ - Union[ - Dict[str, Union[Dict, List, str, int, float, bool]], - List[Dict[str, Union[Dict, List, str, int, float, bool]]], - ] - ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] +def retrieve(query: str, + filters: Optional[Union[Dict[str, Union[Dict, List, str, int, + float, bool]], + List[Dict[str, + Union[Dict, List, str, int, + float, bool]]], ]] = None, + top_k: Optional[int] = None, + index: str = None, + headers: Optional[Dict[str, str]] = None, + scale_score: bool = None) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -466,7 +505,14 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. #### TfidfRetriever.retrieve\_batch ```python -def retrieve_batch(queries: Union[str, List[str]], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] +def retrieve_batch(queries: Union[str, List[str]], + filters: Optional[Dict[str, Union[Dict, List, str, int, + float, bool]]] = None, + top_k: Optional[int] = None, + index: str = None, + headers: Optional[Dict[str, str]] = None, + batch_size: Optional[int] = None, + scale_score: bool = None) -> List[List[Document]] ``` Scan through documents in DocumentStore and return a small number documents @@ -519,7 +565,25 @@ Karpukhin, Vladimir, et al. (2020): "Dense Passage Retrieval for Open-Domain Que #### DensePassageRetriever.\_\_init\_\_ ```python -def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True) +def __init__(document_store: BaseDocumentStore, + query_embedding_model: Union[ + Path, str] = "facebook/dpr-question_encoder-single-nq-base", + passage_embedding_model: Union[ + Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", + model_version: Optional[str] = None, + max_seq_len_query: int = 64, + max_seq_len_passage: int = 256, + top_k: int = 10, + use_gpu: bool = True, + batch_size: int = 16, + embed_title: bool = True, + use_fast_tokenizers: bool = True, + similarity_function: str = "dot_product", + global_loss_buffer_size: int = 150000, + progress_bar: bool = True, + devices: Optional[List[Union[str, torch.device]]] = None, + use_auth_token: Optional[Union[str, bool]] = None, + scale_score: bool = True) ``` Init the Retriever incl. 
the two encoder models from a local or remote model checkpoint. @@ -587,7 +651,13 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. #### DensePassageRetriever.retrieve ```python -def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] +def retrieve(query: str, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, + bool]]] = None, + top_k: Optional[int] = None, + index: str = None, + headers: Optional[Dict[str, str]] = None, + scale_score: bool = None) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -671,12 +741,18 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. #### DensePassageRetriever.retrieve\_batch ```python -def retrieve_batch(queries: List[str], filters: Optional[ - Union[ - Dict[str, Union[Dict, List, str, int, float, bool]], - List[Dict[str, Union[Dict, List, str, int, float, bool]]], - ] - ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] +def retrieve_batch(queries: List[str], + filters: Optional[Union[Dict[str, Union[Dict, List, str, + int, float, bool]], + List[Dict[str, + Union[Dict, List, str, + int, float, + bool]]], ]] = None, + top_k: Optional[int] = None, + index: str = None, + headers: Optional[Dict[str, str]] = None, + batch_size: Optional[int] = None, + scale_score: bool = None) -> List[List[Document]] ``` Scan through documents in DocumentStore and return a small number documents @@ -802,7 +878,36 @@ Embeddings of documents / passages shape (batch_size, embedding_dim) #### DensePassageRetriever.train ```python -def train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_samples: int = None, max_processes: int = 128, multiprocessing_strategy: Optional[str] = None, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, use_amp: str = None, optimizer_name: str = "AdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder", checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, early_stopping: Optional[EarlyStopping] = None) +def train(data_dir: str, + train_filename: str, + dev_filename: str = None, + test_filename: str = None, + max_samples: int = None, + max_processes: int = 128, + multiprocessing_strategy: Optional[str] = None, + dev_split: float = 0, + batch_size: int = 2, + embed_title: bool = True, + num_hard_negatives: int = 1, + num_positives: int = 1, + n_epochs: int = 3, + evaluate_every: int = 1000, + n_gpu: int = 1, + learning_rate: float = 1e-5, + epsilon: float = 1e-08, + weight_decay: float = 0.0, + num_warmup_steps: int = 100, + grad_acc_steps: int = 1, + use_amp: str = None, + optimizer_name: str = "AdamW", + optimizer_correct_bias: bool = True, + save_dir: str = "../saved_models/dpr", + query_encoder_save_dir: str = "query_encoder", 
+ passage_encoder_save_dir: str = "passage_encoder", + checkpoint_root_dir: Path = Path("model_checkpoints"), + checkpoint_every: Optional[int] = None, + checkpoints_to_keep: int = 3, + early_stopping: Optional[EarlyStopping] = None) ``` train a DensePassageRetrieval model @@ -856,7 +961,9 @@ If any checkpoints are stored, a subsequent run of train() will resume training #### DensePassageRetriever.save ```python -def save(save_dir: Union[Path, str], query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder") +def save(save_dir: Union[Path, str], + query_encoder_dir: str = "query_encoder", + passage_encoder_dir: str = "passage_encoder") ``` Save DensePassageRetriever to the specified directory. @@ -877,7 +984,18 @@ None ```python @classmethod -def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder") +def load(cls, + load_dir: Union[Path, str], + document_store: BaseDocumentStore, + max_seq_len_query: int = 64, + max_seq_len_passage: int = 256, + use_gpu: bool = True, + batch_size: int = 16, + embed_title: bool = True, + use_fast_tokenizers: bool = True, + similarity_function: str = "dot_product", + query_encoder_dir: str = "query_encoder", + passage_encoder_dir: str = "passage_encoder") ``` Load DensePassageRetriever from the specified directory. @@ -901,7 +1019,30 @@ Kostić, Bogdan, et al. (2021): "Multi-modal Retrieval of Tables and Texts Using #### TableTextRetriever.\_\_init\_\_ ```python -def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-question_encoder", passage_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder", table_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-table_encoder", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, use_fast: bool = True) +def __init__( + document_store: BaseDocumentStore, + query_embedding_model: Union[ + Path, str] = "deepset/bert-small-mm_retrieval-question_encoder", + passage_embedding_model: Union[ + Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder", + table_embedding_model: Union[ + Path, str] = "deepset/bert-small-mm_retrieval-table_encoder", + model_version: Optional[str] = None, + max_seq_len_query: int = 64, + max_seq_len_passage: int = 256, + max_seq_len_table: int = 256, + top_k: int = 10, + use_gpu: bool = True, + batch_size: int = 16, + embed_meta_fields: List[str] = ["name", "section_title", "caption"], + use_fast_tokenizers: bool = True, + similarity_function: str = "dot_product", + global_loss_buffer_size: int = 150000, + progress_bar: bool = True, + devices: Optional[List[Union[str, torch.device]]] = None, + use_auth_token: Optional[Union[str, bool]] = None, + scale_score: bool = 
True, + use_fast: bool = True) ``` Init the Retriever incl. the two encoder models from a local or remote model checkpoint. @@ -956,12 +1097,18 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. #### TableTextRetriever.retrieve\_batch ```python -def retrieve_batch(queries: List[str], filters: Optional[ - Union[ - Dict[str, Union[Dict, List, str, int, float, bool]], - List[Dict[str, Union[Dict, List, str, int, float, bool]]], - ] - ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] +def retrieve_batch(queries: List[str], + filters: Optional[Union[Dict[str, Union[Dict, List, str, + int, float, bool]], + List[Dict[str, + Union[Dict, List, str, + int, float, + bool]]], ]] = None, + top_k: Optional[int] = None, + index: str = None, + headers: Optional[Dict[str, str]] = None, + batch_size: Optional[int] = None, + scale_score: bool = None) -> List[List[Document]] ``` Scan through documents in DocumentStore and return a small number documents @@ -1090,7 +1237,38 @@ Embeddings of documents / passages. Shape: (batch_size, embedding_dim) #### TableTextRetriever.train ```python -def train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_samples: int = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_meta_fields: List[str] = ["page_title", "section_title", "caption"], num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, use_amp: str = None, optimizer_name: str = "AdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/mm_retrieval", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder", table_encoder_save_dir: str = "table_encoder", checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, early_stopping: Optional[EarlyStopping] = None) +def train(data_dir: str, + train_filename: str, + dev_filename: str = None, + test_filename: str = None, + max_samples: int = None, + max_processes: int = 128, + dev_split: float = 0, + batch_size: int = 2, + embed_meta_fields: List[str] = [ + "page_title", "section_title", "caption" + ], + num_hard_negatives: int = 1, + num_positives: int = 1, + n_epochs: int = 3, + evaluate_every: int = 1000, + n_gpu: int = 1, + learning_rate: float = 1e-5, + epsilon: float = 1e-08, + weight_decay: float = 0.0, + num_warmup_steps: int = 100, + grad_acc_steps: int = 1, + use_amp: str = None, + optimizer_name: str = "AdamW", + optimizer_correct_bias: bool = True, + save_dir: str = "../saved_models/mm_retrieval", + query_encoder_save_dir: str = "query_encoder", + passage_encoder_save_dir: str = "passage_encoder", + table_encoder_save_dir: str = "table_encoder", + checkpoint_root_dir: Path = Path("model_checkpoints"), + checkpoint_every: Optional[int] = None, + checkpoints_to_keep: int = 3, + early_stopping: Optional[EarlyStopping] = None) ``` Train a TableTextRetrieval model. 
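`TableTextRetriever` and `DensePassageRetriever` share the same lifecycle: initialize the encoders against a document store, write documents, then precompute embeddings. A sketch with DPR and a FAISS store (the index settings are illustrative):

```python
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever

document_store = FAISSDocumentStore(faiss_index_factory_str="Flat", embedding_dim=768)
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    embed_title=True,
)
# After writing documents, precompute their embeddings with the passage encoder
document_store.update_embeddings(retriever)
```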
@@ -1144,7 +1322,10 @@ checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is cr #### TableTextRetriever.save ```python -def save(save_dir: Union[Path, str], query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder") +def save(save_dir: Union[Path, str], + query_encoder_dir: str = "query_encoder", + passage_encoder_dir: str = "passage_encoder", + table_encoder_dir: str = "table_encoder") ``` Save TableTextRetriever to the specified directory. @@ -1166,7 +1347,20 @@ None ```python @classmethod -def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder") +def load(cls, + load_dir: Union[Path, str], + document_store: BaseDocumentStore, + max_seq_len_query: int = 64, + max_seq_len_passage: int = 256, + max_seq_len_table: int = 256, + use_gpu: bool = True, + batch_size: int = 16, + embed_meta_fields: List[str] = ["name", "section_title", "caption"], + use_fast_tokenizers: bool = True, + similarity_function: str = "dot_product", + query_encoder_dir: str = "query_encoder", + passage_encoder_dir: str = "passage_encoder", + table_encoder_dir: str = "table_encoder") ``` Load TableTextRetriever from the specified directory. @@ -1184,7 +1378,21 @@ class EmbeddingRetriever(BaseRetriever) #### EmbeddingRetriever.\_\_init\_\_ ```python -def __init__(document_store: BaseDocumentStore, embedding_model: str, model_version: Optional[str] = None, use_gpu: bool = True, batch_size: int = 32, max_seq_len: int = 512, model_format: Optional[str] = None, pooling_strategy: str = "reduce_mean", emb_extraction_layer: int = -1, top_k: int = 10, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, embed_meta_fields: List[str] = []) +def __init__(document_store: BaseDocumentStore, + embedding_model: str, + model_version: Optional[str] = None, + use_gpu: bool = True, + batch_size: int = 32, + max_seq_len: int = 512, + model_format: Optional[str] = None, + pooling_strategy: str = "reduce_mean", + emb_extraction_layer: int = -1, + top_k: int = 10, + progress_bar: bool = True, + devices: Optional[List[Union[str, torch.device]]] = None, + use_auth_token: Optional[Union[str, bool]] = None, + scale_score: bool = True, + embed_meta_fields: List[str] = []) ``` **Arguments**: @@ -1239,7 +1447,13 @@ performance if your titles contain meaningful information for retrieval #### EmbeddingRetriever.retrieve ```python -def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] +def retrieve(query: str, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, + bool]]] = None, + top_k: Optional[int] = None, + index: str = None, + headers: Optional[Dict[str, str]] = None, + scale_score: bool = None) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -1323,12 +1537,18 @@ Otherwise raw 
similarity scores (e.g. cosine or dot_product) will be used. #### EmbeddingRetriever.retrieve\_batch ```python -def retrieve_batch(queries: List[str], filters: Optional[ - Union[ - Dict[str, Union[Dict, List, str, int, float, bool]], - List[Dict[str, Union[Dict, List, str, int, float, bool]]], - ] - ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] +def retrieve_batch(queries: List[str], + filters: Optional[Union[Dict[str, Union[Dict, List, str, + int, float, bool]], + List[Dict[str, + Union[Dict, List, str, + int, float, + bool]]], ]] = None, + top_k: Optional[int] = None, + index: str = None, + headers: Optional[Dict[str, str]] = None, + batch_size: Optional[int] = None, + scale_score: bool = None) -> List[List[Document]] ``` Scan through documents in DocumentStore and return a small number documents @@ -1454,7 +1674,11 @@ Embeddings, one per input document #### EmbeddingRetriever.train ```python -def train(training_data: List[Dict[str, Any]], learning_rate: float = 2e-5, n_epochs: int = 1, num_warmup_steps: int = None, batch_size: int = 16) -> None +def train(training_data: List[Dict[str, Any]], + learning_rate: float = 2e-5, + n_epochs: int = 1, + num_warmup_steps: int = None, + batch_size: int = 16) -> None ``` Trains/adapts the underlying embedding model. @@ -1507,7 +1731,22 @@ Xiong, Wenhan, et. al. (2020): "Answering complex open-domain questions with mul #### MultihopEmbeddingRetriever.\_\_init\_\_ ```python -def __init__(document_store: BaseDocumentStore, embedding_model: str, model_version: Optional[str] = None, num_iterations: int = 2, use_gpu: bool = True, batch_size: int = 32, max_seq_len: int = 512, model_format: str = "farm", pooling_strategy: str = "reduce_mean", emb_extraction_layer: int = -1, top_k: int = 10, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, embed_meta_fields: List[str] = []) +def __init__(document_store: BaseDocumentStore, + embedding_model: str, + model_version: Optional[str] = None, + num_iterations: int = 2, + use_gpu: bool = True, + batch_size: int = 32, + max_seq_len: int = 512, + model_format: str = "farm", + pooling_strategy: str = "reduce_mean", + emb_extraction_layer: int = -1, + top_k: int = 10, + progress_bar: bool = True, + devices: Optional[List[Union[str, torch.device]]] = None, + use_auth_token: Optional[Union[str, bool]] = None, + scale_score: bool = True, + embed_meta_fields: List[str] = []) ``` **Arguments**: @@ -1563,7 +1802,13 @@ performance if your titles contain meaningful information for retrieval #### MultihopEmbeddingRetriever.retrieve ```python -def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] +def retrieve(query: str, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, + bool]]] = None, + top_k: Optional[int] = None, + index: str = None, + headers: Optional[Dict[str, str]] = None, + scale_score: bool = None) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -1647,12 +1892,18 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. 
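`MultihopEmbeddingRetriever` is set up like a plain `EmbeddingRetriever`, with `num_iterations` controlling the query-reformulation hops. A sketch of the base case (the sentence-transformers model and 384-dimensional store are one possible pairing):

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import EmbeddingRetriever

document_store = InMemoryDocumentStore(embedding_dim=384, similarity="cosine")
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
    model_format="sentence_transformers",
)
document_store.update_embeddings(retriever)

docs = retriever.retrieve(query="How does model distillation work?", top_k=5)
```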
#### MultihopEmbeddingRetriever.retrieve\_batch ```python -def retrieve_batch(queries: List[str], filters: Optional[ - Union[ - Dict[str, Union[Dict, List, str, int, float, bool]], - List[Dict[str, Union[Dict, List, str, int, float, bool]]], - ] - ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] +def retrieve_batch(queries: List[str], + filters: Optional[Union[Dict[str, Union[Dict, List, str, + int, float, bool]], + List[Dict[str, + Union[Dict, List, str, + int, float, + bool]]], ]] = None, + top_k: Optional[int] = None, + index: str = None, + headers: Optional[Dict[str, str]] = None, + batch_size: Optional[int] = None, + scale_score: bool = None) -> List[List[Document]] ``` Scan through documents in DocumentStore and return a small number documents @@ -1759,7 +2010,10 @@ The generated SPARQL query is executed on a knowledge graph. #### Text2SparqlRetriever.\_\_init\_\_ ```python -def __init__(knowledge_graph, model_name_or_path, top_k: int = 1, use_auth_token: Optional[Union[str, bool]] = None) +def __init__(knowledge_graph, + model_name_or_path, + top_k: int = 1, + use_auth_token: Optional[Union[str, bool]] = None) ``` Init the Retriever by providing a knowledge graph and a pre-trained BART model diff --git a/docs/_src/api/api/summarizer.md b/docs/_src/api/api/summarizer.md index d76878f78..ee6c38804 100644 --- a/docs/_src/api/api/summarizer.md +++ b/docs/_src/api/api/summarizer.md @@ -18,7 +18,8 @@ Abstract class for Summarizer ```python @abstractmethod -def predict(documents: List[Document], generate_single_summary: Optional[bool] = None) -> List[Document] +def predict(documents: List[Document], + generate_single_summary: Optional[bool] = None) -> List[Document] ``` Abstract method for creating a summary. @@ -87,7 +88,19 @@ See the up-to-date list of available models on #### TransformersSummarizer.\_\_init\_\_ ```python -def __init__(model_name_or_path: str = "google/pegasus-xsum", model_version: Optional[str] = None, tokenizer: Optional[str] = None, max_length: int = 200, min_length: int = 5, use_gpu: bool = True, clean_up_tokenization_spaces: bool = True, separator_for_single_summary: str = " ", generate_single_summary: bool = False, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None) +def __init__(model_name_or_path: str = "google/pegasus-xsum", + model_version: Optional[str] = None, + tokenizer: Optional[str] = None, + max_length: int = 200, + min_length: int = 5, + use_gpu: bool = True, + clean_up_tokenization_spaces: bool = True, + separator_for_single_summary: str = " ", + generate_single_summary: bool = False, + batch_size: int = 16, + progress_bar: bool = True, + use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None) ``` Load a Summarization model from Transformers. @@ -129,7 +142,8 @@ parameter is not used and a single cpu device is used for inference. #### TransformersSummarizer.predict ```python -def predict(documents: List[Document], generate_single_summary: Optional[bool] = None) -> List[Document] +def predict(documents: List[Document], + generate_single_summary: Optional[bool] = None) -> List[Document] ``` Produce the summarization from the supplied documents. 
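A short sketch with the default Pegasus checkpoint; the document text is a stock example, and the assumption here is one summary `Document` per input:

```python
from haystack import Document
from haystack.nodes import TransformersSummarizer

summarizer = TransformersSummarizer(model_name_or_path="google/pegasus-xsum")
docs = [
    Document(
        content="PG&E stated it scheduled the blackouts in response to forecasts for high "
        "winds amid dry conditions. The aim is to reduce the risk of wildfires."
    )
]
summaries = summarizer.predict(documents=docs)
print(summaries[0].content)  # the generated summary; the original text stays available via meta
```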
@@ -154,7 +168,11 @@ the original, not summarized text #### TransformersSummarizer.predict\_batch ```python -def predict_batch(documents: Union[List[Document], List[List[Document]]], generate_single_summary: Optional[bool] = None, batch_size: Optional[int] = None) -> Union[List[Document], List[List[Document]]] +def predict_batch( + documents: Union[List[Document], List[List[Document]]], + generate_single_summary: Optional[bool] = None, + batch_size: Optional[int] = None +) -> Union[List[Document], List[List[Document]]] ``` Produce the summarization from the supplied documents. diff --git a/docs/_src/api/api/translator.md b/docs/_src/api/api/translator.md index 8f2ddc66a..50ab26fd7 100644 --- a/docs/_src/api/api/translator.md +++ b/docs/_src/api/api/translator.md @@ -18,7 +18,13 @@ Abstract class for a Translator component that translates either a query or a do ```python @abstractmethod -def translate(results: List[Dict[str, Any]] = None, query: Optional[str] = None, documents: Optional[Union[List[Document], List[Answer], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) -> Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]] +def translate( + results: List[Dict[str, Any]] = None, + query: Optional[str] = None, + documents: Optional[Union[List[Document], List[Answer], List[str], + List[Dict[str, Any]]]] = None, + dict_key: Optional[str] = None +) -> Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]] ``` Translate the passed query or a list of documents from language A to B. @@ -28,7 +34,12 @@ Translate the passed query or a list of documents from language A to B. #### BaseTranslator.run ```python -def run(results: List[Dict[str, Any]] = None, query: Optional[str] = None, documents: Optional[Union[List[Document], List[Answer], List[str], List[Dict[str, Any]]]] = None, answers: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) +def run(results: List[Dict[str, Any]] = None, + query: Optional[str] = None, + documents: Optional[Union[List[Document], List[Answer], List[str], + List[Dict[str, Any]]]] = None, + answers: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, + dict_key: Optional[str] = None) ``` Method that gets executed when this class is used as a Node in a Haystack Pipeline @@ -68,7 +79,14 @@ We currently recommend using OPUS models (see __init__() for details) #### TransformersTranslator.\_\_init\_\_ ```python -def __init__(model_name_or_path: str, tokenizer_name: Optional[str] = None, max_seq_len: Optional[int] = None, clean_up_tokenization_spaces: Optional[bool] = True, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None) +def __init__(model_name_or_path: str, + tokenizer_name: Optional[str] = None, + max_seq_len: Optional[int] = None, + clean_up_tokenization_spaces: Optional[bool] = True, + use_gpu: bool = True, + progress_bar: bool = True, + use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None) ``` Initialize the translator with a model that fits your targeted languages. While we support all seq2seq @@ -109,7 +127,13 @@ parameter is not used and a single cpu device is used for inference. 
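A sketch with one of the recommended OPUS checkpoints (French to English here; any language pair with an available OPUS model works the same way), using the `translate()` call documented next:

```python
from haystack import Document
from haystack.nodes import TransformersTranslator

translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-fr-en")

docs = [Document(content="Haystack est un framework open source pour le question answering.")]
translated_docs = translator.translate(documents=docs)

# A bare query string can be translated the same way
translated_query = translator.translate(query="Où se trouve le Louvre ?")
```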
#### TransformersTranslator.translate ```python -def translate(results: Optional[List[Dict[str, Any]]] = None, query: Optional[str] = None, documents: Optional[Union[List[Document], List[Answer], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) -> Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]] +def translate( + results: Optional[List[Dict[str, Any]]] = None, + query: Optional[str] = None, + documents: Optional[Union[List[Document], List[Answer], List[str], + List[Dict[str, Any]]]] = None, + dict_key: Optional[str] = None +) -> Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]] ``` Run the actual translation. You can supply a query or a list of documents. Whatever is supplied will be translated. @@ -126,7 +150,14 @@ Run the actual translation. You can supply a query or a list of documents. Whate #### TransformersTranslator.translate\_batch ```python -def translate_batch(queries: Optional[List[str]] = None, documents: Optional[Union[List[Document], List[Answer], List[List[Document]], List[List[Answer]]]] = None, batch_size: Optional[int] = None) -> List[Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]]] +def translate_batch( + queries: Optional[List[str]] = None, + documents: Optional[Union[List[Document], List[Answer], + List[List[Document]], + List[List[Answer]]]] = None, + batch_size: Optional[int] = None +) -> List[Union[str, List[Document], List[Answer], List[str], List[Dict[ + str, Any]]]] ``` Run the actual translation. You can supply a single query, a list of queries or a list (of lists) of documents. diff --git a/docs/_src/api/api/utils.md b/docs/_src/api/api/utils.md index 58c12700a..6e7838238 100644 --- a/docs/_src/api/api/utils.md +++ b/docs/_src/api/api/utils.md @@ -7,7 +7,9 @@ #### print\_answers ```python -def print_answers(results: dict, details: str = "all", max_text_len: Optional[int] = None) +def print_answers(results: dict, + details: str = "all", + max_text_len: Optional[int] = None) ``` Utility function to print results of Haystack pipelines @@ -27,7 +29,10 @@ None #### print\_documents ```python -def print_documents(results: dict, max_text_len: Optional[int] = None, print_name: bool = True, print_meta: bool = False) +def print_documents(results: dict, + max_text_len: Optional[int] = None, + print_name: bool = True, + print_meta: bool = False) ``` Utility that prints a compressed representation of the documents returned by a pipeline. @@ -90,7 +95,12 @@ Convert the export from the labeling UI to the SQuAD format for training. 
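A sketch of the printing utility with a hand-built result dict; in practice the output of `pipeline.run()` is passed in directly:

```python
from haystack.schema import Answer
from haystack.utils import print_answers

# Hand-built stand-in for a pipeline result; normally: results = pipeline.run(query=...)
results = {
    "query": "Who wrote Faust?",
    "answers": [
        Answer(
            answer="Johann Wolfgang von Goethe",
            score=0.95,
            context="...Faust, a tragic play written by Goethe...",
        )
    ],
}
print_answers(results, details="minimum")  # "minimum", "medium", or "all"
```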
#### convert\_files\_to\_docs ```python -def convert_files_to_docs(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document] +def convert_files_to_docs( + dir_path: str, + clean_func: Optional[Callable] = None, + split_paragraphs: bool = False, + encoding: Optional[str] = None, + id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` Convert all files(.txt, .pdf, .docx) in the sub-directories of the given path to Documents that can be written to a @@ -114,7 +124,13 @@ If you do this, the Document ID will be generated by using the content and the d #### tika\_convert\_files\_to\_docs ```python -def tika_convert_files_to_docs(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False, merge_short: bool = True, merge_lowercase: bool = True, id_hash_keys: Optional[List[str]] = None) -> List[Document] +def tika_convert_files_to_docs( + dir_path: str, + clean_func: Optional[Callable] = None, + split_paragraphs: bool = False, + merge_short: bool = True, + merge_lowercase: bool = True, + id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` Convert all files (.txt, .pdf) in the sub-directories of the given path to Documents that can be written to a @@ -320,7 +336,13 @@ EarlyStopping class instead as long as it implements the method `check_stopping( #### EarlyStopping.\_\_init\_\_ ```python -def __init__(head: int = 0, metric: Union[str, Callable] = "loss", save_dir: Optional[str] = None, mode: Literal["min", "max"] = "min", patience: int = 0, min_delta: float = 0.001, min_evals: int = 0) +def __init__(head: int = 0, + metric: Union[str, Callable] = "loss", + save_dir: Optional[str] = None, + mode: Literal["min", "max"] = "min", + patience: int = 0, + min_delta: float = 0.001, + min_evals: int = 0) ``` **Arguments**: diff --git a/haystack/nodes/file_converter/tika.py b/haystack/nodes/file_converter/tika.py index 4e8272141..1c8514efe 100644 --- a/haystack/nodes/file_converter/tika.py +++ b/haystack/nodes/file_converter/tika.py @@ -29,7 +29,7 @@ def launch_tika(sleep=15, delete_existing=False): _ = subprocess.run([f"docker rm --force {TIKA_CONTAINER_NAME}"], shell=True, stdout=subprocess.DEVNULL) status = subprocess.run( [ - f"docker start {TIKA_CONTAINER_NAME} > /dev/null 2>&1 || docker run -p 9998:9998 --name {TIKA_CONTAINER_NAME} apache/tika:1.24.1" + f"docker start {TIKA_CONTAINER_NAME} > /dev/null 2>&1 || docker run -p 9998:9998 --name {TIKA_CONTAINER_NAME} apache/tika:1.28.4" ], shell=True, ) @@ -102,7 +102,7 @@ class TikaConverter(BaseConverter): if ping.status_code != 200: raise Exception( f"Apache Tika server is not reachable at the URL '{tika_url}'. 
To run it locally" - f"with Docker, execute: 'docker run -p 9998:9998 apache/tika:1.24.1'" + f"with Docker, execute: 'docker run -p 9998:9998 apache/tika:1.28.4'" ) self.tika_url = tika_url super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages) diff --git a/haystack/utils/doc_store.py b/haystack/utils/doc_store.py index 306ef46fa..208036602 100644 --- a/haystack/utils/doc_store.py +++ b/haystack/utils/doc_store.py @@ -46,7 +46,7 @@ def launch_opensearch(sleep=15, delete_existing=False): _ = subprocess.run([f"docker rm --force {OPENSEARCH_CONTAINER_NAME}"], shell=True, stdout=subprocess.DEVNULL) status = subprocess.run( [ - f'docker start {OPENSEARCH_CONTAINER_NAME} > /dev/null 2>&1 || docker run -d -p 9201:9200 -p 9600:9600 -e "discovery.type=single-node" --name {OPENSEARCH_CONTAINER_NAME} opensearchproject/opensearch:1.2.4' + f'docker start {OPENSEARCH_CONTAINER_NAME} > /dev/null 2>&1 || docker run -d -p 9201:9200 -p 9600:9600 -e "discovery.type=single-node" --name {OPENSEARCH_CONTAINER_NAME} opensearchproject/opensearch:1.3.5' ], shell=True, ) @@ -65,7 +65,7 @@ def launch_weaviate(sleep=15): logger.debug("Starting Weaviate ...") status = subprocess.run( [ - f"docker start {WEAVIATE_CONTAINER_NAME} > /dev/null 2>&1 || docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --name {WEAVIATE_CONTAINER_NAME} semitechnologies/weaviate:1.11.0" + f"docker start {WEAVIATE_CONTAINER_NAME} > /dev/null 2>&1 || docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --name {WEAVIATE_CONTAINER_NAME} semitechnologies/weaviate:1.14.0" ], shell=True, ) diff --git a/pyproject.toml b/pyproject.toml index 105fcb04a..0420497fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,9 +62,9 @@ dependencies = [ "mmh3", # fast hashing function (murmurhash3) "quantulum3", # quantities extraction from text "posthog", # telemetry - "azure-ai-formrecognizer==3.2.0b2", # forms reader + "azure-ai-formrecognizer>=3.2.0b2", # forms reader # audio's espnet-model-zoo requires huggingface-hub version <0.8 while we need >=0.5 to be able to use create_repo in FARMReader - "huggingface-hub<0.8.0,>=0.5.0", + "huggingface-hub>=0.5.0", # Preprocessing "more_itertools", # for windowing @@ -168,9 +168,9 @@ preprocessing = [ "python-magic-bin; platform_system == 'Windows'", # Needs to be installed without python-magic, otherwise Windows CI gets stuck. ] ocr = [ - "pytesseract==0.3.7", + "pytesseract>0.3.7", "pillow", - "pdf2image==1.14.0", + "pdf2image>1.14", ] onnx = [ "onnxruntime", @@ -207,14 +207,7 @@ dev = [ # Code formatting "black[jupyter]==22.6.0", # Documentation - "pydoc-markdown==4.5.1", # FIXME Unpin! - # azure-core is a dependency of azure-ai-formrecognizer - # In order to stop malicious pip backtracking during pip install farm-haystack[all] documented in https://github.com/deepset-ai/haystack/issues/2280 - # we have to resolve a dependency version conflict ourself. - # azure-core>=1.23 conflicts with pydoc-markdown's dependency on databind>=1.5.0 which itself requires typing-extensions<4.0.0 - # azure-core>=1.23 needs typing-extensions>=4.0.1 - # pip unfortunately backtracks into the databind direction ultimately getting lost. 
- "azure-core<1.23", + "pydoc-markdown", "mkdocs", "jupytercontrib", "watchdog", # ==1.0.2 diff --git a/test/conftest.py b/test/conftest.py index 9524d4223..6b346dbb9 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -423,7 +423,7 @@ def weaviate_fixture(): print("Starting Weaviate servers ...") status = subprocess.run(["docker rm haystack_test_weaviate"], shell=True) status = subprocess.run( - ["docker run -d --name haystack_test_weaviate -p 8080:8080 semitechnologies/weaviate:1.11.0"], shell=True + ["docker run -d --name haystack_test_weaviate -p 8080:8080 semitechnologies/weaviate:1.14.1"], shell=True ) if status.returncode: raise Exception("Failed to launch Weaviate. Please check docker container logs.") @@ -460,7 +460,7 @@ def tika_fixture(): raise Exception("Unable to connect Tika. Please check tika endpoint {0}.".format(tika_url)) except: print("Starting Tika ...") - status = subprocess.run(["docker run -d --name tika -p 9998:9998 apache/tika:1.24.1"], shell=True) + status = subprocess.run(["docker run -d --name tika -p 9998:9998 apache/tika:1.28.4"], shell=True) if status.returncode: raise Exception("Failed to launch Tika. Please check docker container logs.") time.sleep(30)