diff --git a/docs/_src/api/api/document_classifier.md b/docs/_src/api/api/document_classifier.md index 2f4ae07b6..f0fbe664c 100644 --- a/docs/_src/api/api/document_classifier.md +++ b/docs/_src/api/api/document_classifier.md @@ -47,7 +47,7 @@ With this document_classifier, you can directly get predictions via predict() **Usage example at query time:** ```python | ... -| retriever = ElasticsearchRetriever(document_store=document_store) +| retriever = BM25Retriever(document_store=document_store) | document_classifier = TransformersDocumentClassifier(model_name_or_path="bhadresh-savani/distilbert-base-uncased-emotion") | p = Pipeline() | p.add_node(component=retriever, name="Retriever", inputs=["Query"]) diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index 337d0bc97..5be40e667 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -434,7 +434,7 @@ A DocumentStore using Elasticsearch to store and query the documents for our sea - `aws4auth`: Authentication for usage with aws elasticsearch (can be generated with the requests-aws4auth package) - `index`: Name of index in elasticsearch to use for storing the documents that we want to search. If not existing yet, we will create one. - `label_index`: Name of index in elasticsearch to use for storing labels. If not existing yet, we will create one. -- `search_fields`: Name of fields used by ElasticsearchRetriever to find matches in the docs to our incoming query (using elastic's multi_match query), e.g. ["title", "full_text"] +- `search_fields`: Name of fields used by BM25Retriever to find matches in the docs to our incoming query (using elastic's multi_match query), e.g. ["title", "full_text"] - `content_field`: Name of field that might contain the answer and will therefore be passed to the Reader Model (e.g. "full_text"). If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will just be returned. 
- `name_field`: Name of field that contains the title of the the doc @@ -1250,7 +1250,7 @@ the KNN plugin that can scale to a large number of documents. - `aws4auth`: Authentication for usage with aws elasticsearch (can be generated with the requests-aws4auth package) - `index`: Name of index in elasticsearch to use for storing the documents that we want to search. If not existing yet, we will create one. - `label_index`: Name of index in elasticsearch to use for storing labels. If not existing yet, we will create one. -- `search_fields`: Name of fields used by ElasticsearchRetriever to find matches in the docs to our incoming query (using elastic's multi_match query), e.g. ["title", "full_text"] +- `search_fields`: Name of fields used by BM25Retriever to find matches in the docs to our incoming query (using elastic's multi_match query), e.g. ["title", "full_text"] - `content_field`: Name of field that might contain the answer and will therefore be passed to the Reader Model (e.g. "full_text"). If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will just be returned. 
- `name_field`: Name of field that contains the title of the the doc diff --git a/docs/_src/api/api/pipelines.md b/docs/_src/api/api/pipelines.md index 8d005ec4b..59dff3f2d 100644 --- a/docs/_src/api/api/pipelines.md +++ b/docs/_src/api/api/pipelines.md @@ -104,7 +104,7 @@ Here's a sample configuration: | }, | { | "name": "MyESRetriever", - | "type": "ElasticsearchRetriever", + | "type": "BM25Retriever", | "params": { | "document_store": "MyDocumentStore", # params can reference other components defined in the YAML | "custom_query": None, @@ -161,7 +161,7 @@ Here's a sample configuration: | no_ans_boost: -10 | model_name_or_path: deepset/roberta-base-squad2 | - name: MyESRetriever - | type: ElasticsearchRetriever + | type: BM25Retriever | params: | document_store: MyDocumentStore # params can reference other components defined in the YAML | custom_query: null @@ -374,8 +374,8 @@ Add a new node to the pipeline. method to process incoming data from predecessor node. - `name`: The name for the node. It must not contain any dots. - `inputs`: A list of inputs to the node. If the predecessor node has a single outgoing edge, just the name -of node is sufficient. For instance, a 'ElasticsearchRetriever' node would always output a single -edge with a list of documents. It can be represented as ["ElasticsearchRetriever"]. +of node is sufficient. For instance, a 'BM25Retriever' node would always output a single +edge with a list of documents. It can be represented as ["BM25Retriever"]. In cases when the predecessor node has multiple outputs, e.g., a "QueryClassifier", the output must be specified explicitly as "QueryClassifier.output_2". 
@@ -672,7 +672,7 @@ Here's a sample configuration: | no_ans_boost: -10 | model_name_or_path: deepset/roberta-base-squad2 | - name: MyESRetriever - | type: ElasticsearchRetriever + | type: BM25Retriever | params: | document_store: MyDocumentStore # params can reference other components defined in the YAML | custom_query: null @@ -730,7 +730,7 @@ Here's a sample configuration: | }, | { | "name": "MyESRetriever", - | "type": "ElasticsearchRetriever", + | "type": "BM25Retriever", | "params": { | "document_store": "MyDocumentStore", # params can reference other components defined in the YAML | "custom_query": None, @@ -882,7 +882,7 @@ Here's a sample configuration: | no_ans_boost: -10 | model_name_or_path: deepset/roberta-base-squad2 | - name: MyESRetriever - | type: ElasticsearchRetriever + | type: BM25Retriever | params: | document_store: MyDocumentStore # params can reference other components defined in the YAML | custom_query: null @@ -1017,8 +1017,8 @@ Add a new node to the pipeline. method to process incoming data from predecessor node. - `name`: The name for the node. It must not contain any dots. - `inputs`: A list of inputs to the node. If the predecessor node has a single outgoing edge, just the name -of node is sufficient. For instance, a 'ElasticsearchRetriever' node would always output a single -edge with a list of documents. It can be represented as ["ElasticsearchRetriever"]. +of node is sufficient. For instance, a 'BM25Retriever' node would always output a single +edge with a list of documents. It can be represented as ["BM25Retriever"]. In cases when the predecessor node has multiple outputs, e.g., a "QueryClassifier", the output must be specified explicitly as "QueryClassifier.output_2". 
@@ -1107,7 +1107,7 @@ Here's a sample configuration: | no_ans_boost: -10 | model_name_or_path: deepset/roberta-base-squad2 | - name: MyESRetriever - | type: ElasticsearchRetriever + | type: BM25Retriever | params: | document_store: MyDocumentStore # params can reference other components defined in the YAML | custom_query: null diff --git a/docs/_src/api/api/ranker.md b/docs/_src/api/api/ranker.md index e9de4491b..3c00788fe 100644 --- a/docs/_src/api/api/ranker.md +++ b/docs/_src/api/api/ranker.md @@ -81,7 +81,7 @@ https://www.sbert.net/docs/pretrained-models/ce-msmarco.html#usage-with-transfor Usage example: ... -retriever = ElasticsearchRetriever(document_store=document_store) +retriever = BM25Retriever(document_store=document_store) ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2") p = Pipeline() p.add_node(component=retriever, name="ESRetriever", inputs=["Query"]) diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md index dc46aec85..3e5c99790 100644 --- a/docs/_src/api/api/retriever.md +++ b/docs/_src/api/api/retriever.md @@ -94,15 +94,15 @@ contains the keys "predictions" and "metrics". # Module sparse - + -## ElasticsearchRetriever +## BM25Retriever ```python -class ElasticsearchRetriever(BaseRetriever) +class BM25Retriever(BaseRetriever) ``` - + #### \_\_init\_\_ @@ -183,7 +183,7 @@ Defaults to False. ``` - `top_k`: How many documents to return per query. - + #### retrieve @@ -209,7 +209,7 @@ Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-c ## ElasticsearchFilterOnlyRetriever ```python -class ElasticsearchFilterOnlyRetriever(ElasticsearchRetriever) +class ElasticsearchFilterOnlyRetriever(BM25Retriever) ``` Naive "Retriever" that returns all documents that match the given filters. No impact of query at all. 
diff --git a/docs/_src/tutorials/tutorials/1.md b/docs/_src/tutorials/tutorials/1.md index 617ab049c..64c107788 100644 --- a/docs/_src/tutorials/tutorials/1.md +++ b/docs/_src/tutorials/tutorials/1.md @@ -150,16 +150,16 @@ They use some simple but fast algorithm. **Alternatives:** -- Customize the `ElasticsearchRetriever`with custom queries (e.g. boosting) and filters +- Customize the `BM25Retriever` with custom queries (e.g. boosting) and filters - Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging - Use `EmbeddingRetriever` to find candidate documents based on the similarity of embeddings (e.g. created via Sentence-BERT) - Use `DensePassageRetriever` to use different embedding models for passage and query (see Tutorial 6) ```python -from haystack.nodes import ElasticsearchRetriever +from haystack.nodes import BM25Retriever -retriever = ElasticsearchRetriever(document_store=document_store) +retriever = BM25Retriever(document_store=document_store) ``` diff --git a/docs/_src/tutorials/tutorials/11.md b/docs/_src/tutorials/tutorials/11.md index 3d86bde55..9d5a9c87e 100644 --- a/docs/_src/tutorials/tutorials/11.md +++ b/docs/_src/tutorials/tutorials/11.md @@ -91,7 +91,7 @@ got_docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, s ``` Here we initialize the core components that we will be gluing together using the `Pipeline` class. -We have a `DocumentStore`, an `ElasticsearchRetriever` and a `FARMReader`. +We have a `DocumentStore`, a `BM25Retriever` and a `FARMReader`. These can be combined to create a classic Retriever-Reader pipeline that is designed to perform Open Domain Question Answering. @@ -100,7 +100,7 @@ to perform Open Domain Question Answering. 
from haystack import Pipeline from haystack.utils import launch_es from haystack.document_stores import ElasticsearchDocumentStore -from haystack.nodes import ElasticsearchRetriever, EmbeddingRetriever, FARMReader +from haystack.nodes import BM25Retriever, EmbeddingRetriever, FARMReader # Initialize DocumentStore and index documents @@ -110,7 +110,7 @@ document_store.delete_documents() document_store.write_documents(got_docs) # Initialize Sparse retriever -es_retriever = ElasticsearchRetriever(document_store=document_store) +es_retriever = BM25Retriever(document_store=document_store) # Initialize dense retriever embedding_retriever = EmbeddingRetriever( @@ -220,7 +220,7 @@ p_extractive.draw("pipeline_extractive.png") Pipelines offer a very simple way to ensemble together different components. In this example, we are going to combine the power of an `EmbeddingRetriever` -with the keyword based `ElasticsearchRetriever`. +with the keyword based `BM25Retriever`. See our [documentation](https://haystack.deepset.ai/docs/latest/retrievermd) to understand why we might want to combine a dense and sparse retriever. 
@@ -373,7 +373,7 @@ components: # define all the building-blocks for Pipeline no_ans_boost: -10 model_name_or_path: deepset/roberta-base-squad2 - name: MyESRetriever - type: ElasticsearchRetriever + type: BM25Retriever params: document_store: MyDocumentStore # params can reference other components defined in the YAML custom_query: null diff --git a/docs/_src/tutorials/tutorials/13.md b/docs/_src/tutorials/tutorials/13.md index 77eeeaa69..0c66434e7 100644 --- a/docs/_src/tutorials/tutorials/13.md +++ b/docs/_src/tutorials/tutorials/13.md @@ -38,7 +38,7 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial from pprint import pprint from tqdm import tqdm -from haystack.nodes import QuestionGenerator, ElasticsearchRetriever, FARMReader +from haystack.nodes import QuestionGenerator, BM25Retriever, FARMReader from haystack.document_stores import ElasticsearchDocumentStore from haystack.pipelines import ( QuestionGenerationPipeline, @@ -112,7 +112,7 @@ This pipeline takes a query as input. 
It retrieves relevant documents and then g ```python -retriever = ElasticsearchRetriever(document_store=document_store) +retriever = BM25Retriever(document_store=document_store) rqg_pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator) print(f"\n * Generating questions for documents matching the query 'Arya Stark'\n") diff --git a/docs/_src/tutorials/tutorials/14.md b/docs/_src/tutorials/tutorials/14.md index 2b168bcda..ef87e20a0 100644 --- a/docs/_src/tutorials/tutorials/14.md +++ b/docs/_src/tutorials/tutorials/14.md @@ -102,7 +102,7 @@ from haystack.utils import ( from haystack.pipelines import Pipeline from haystack.document_stores import ElasticsearchDocumentStore from haystack.nodes import ( - ElasticsearchRetriever, + BM25Retriever, EmbeddingRetriever, FARMReader, TransformersQueryClassifier, @@ -124,7 +124,7 @@ document_store.delete_documents() document_store.write_documents(got_docs) # Initialize Sparse retriever -es_retriever = ElasticsearchRetriever(document_store=document_store) +es_retriever = BM25Retriever(document_store=document_store) # Initialize dense retriever embedding_retriever = EmbeddingRetriever( diff --git a/docs/_src/tutorials/tutorials/15.md b/docs/_src/tutorials/tutorials/15.md index 2512d7276..b9b00851d 100644 --- a/docs/_src/tutorials/tutorials/15.md +++ b/docs/_src/tutorials/tutorials/15.md @@ -10,7 +10,7 @@ id: "tutorial15md" # Open-Domain QA on Tables [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial15_TableQA.ipynb) -This tutorial shows you how to perform question-answering on tables using the `TableTextRetriever` or `ElasticsearchRetriever` as retriever node and the `TableReader` as reader node. +This tutorial shows you how to perform question-answering on tables using the `TableTextRetriever` or `BM25Retriever` as retriever node and the `TableReader` as reader node. 
### Prepare environment @@ -142,7 +142,7 @@ of texts and tables using dense embeddings. It is an extension of the `DensePass **Alternatives:** -- `ElasticsearchRetriever` that uses BM25 algorithm +- `BM25Retriever` that uses BM25 algorithm @@ -165,9 +165,9 @@ document_store.update_embeddings(retriever=retriever) ```python -## Alternative: ElasticsearchRetriever -# from haystack.nodes.retriever import ElasticsearchRetriever -# retriever = ElasticsearchRetriever(document_store=document_store) +## Alternative: BM25Retriever +# from haystack.nodes.retriever import BM25Retriever +# retriever = BM25Retriever(document_store=document_store) ``` diff --git a/docs/_src/tutorials/tutorials/16.md b/docs/_src/tutorials/tutorials/16.md index 6bf899789..0385b1646 100644 --- a/docs/_src/tutorials/tutorials/16.md +++ b/docs/_src/tutorials/tutorials/16.md @@ -40,7 +40,7 @@ This tutorial will show you how to integrate a classification model into your pr ```python # Here are the imports we need from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore -from haystack.nodes import PreProcessor, TransformersDocumentClassifier, FARMReader, ElasticsearchRetriever +from haystack.nodes import PreProcessor, TransformersDocumentClassifier, FARMReader, BM25Retriever from haystack.schema import Document from haystack.utils import convert_files_to_docs, fetch_archive_from_http, print_answers ``` @@ -163,7 +163,7 @@ All we have to do to filter for one of our classes is to set a filter on "classi # Initialize QA-Pipeline from haystack.pipelines import ExtractiveQAPipeline -retriever = ElasticsearchRetriever(document_store=document_store) +retriever = BM25Retriever(document_store=document_store) reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True) pipe = ExtractiveQAPipeline(reader, retriever) ``` diff --git a/docs/_src/tutorials/tutorials/5.md b/docs/_src/tutorials/tutorials/5.md index 88fb18a93..42bc77310 100644 --- 
a/docs/_src/tutorials/tutorials/5.md +++ b/docs/_src/tutorials/tutorials/5.md @@ -130,9 +130,9 @@ document_store.add_eval_data( ```python # Initialize Retriever -from haystack.nodes import ElasticsearchRetriever +from haystack.nodes import BM25Retriever -retriever = ElasticsearchRetriever(document_store=document_store) +retriever = BM25Retriever(document_store=document_store) # Alternative: Evaluate dense retrievers (EmbeddingRetriever or DensePassageRetriever) # The EmbeddingRetriever uses a single transformer based encoder model for query and document. diff --git a/docs/_src/tutorials/tutorials/6.md b/docs/_src/tutorials/tutorials/6.md index e2099e727..3ba561d2a 100644 --- a/docs/_src/tutorials/tutorials/6.md +++ b/docs/_src/tutorials/tutorials/6.md @@ -155,7 +155,7 @@ document_store.write_documents(docs) **Alternatives:** -- The `ElasticsearchRetriever`with custom queries (e.g. boosting) and filters +- The `BM25Retriever` with custom queries (e.g. boosting) and filters - Use `EmbeddingRetriever` to find candidate documents based on the similarity of embeddings (e.g. created via Sentence-BERT) - Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py index 878620782..01b825dfd 100644 --- a/haystack/document_stores/elasticsearch.py +++ b/haystack/document_stores/elasticsearch.py @@ -82,7 +82,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): :param aws4auth: Authentication for usage with aws elasticsearch (can be generated with the requests-aws4auth package) :param index: Name of index in elasticsearch to use for storing the documents that we want to search. If not existing yet, we will create one. :param label_index: Name of index in elasticsearch to use for storing labels. If not existing yet, we will create one. 
- :param search_fields: Name of fields used by ElasticsearchRetriever to find matches in the docs to our incoming query (using elastic's multi_match query), e.g. ["title", "full_text"] + :param search_fields: Name of fields used by BM25Retriever to find matches in the docs to our incoming query (using elastic's multi_match query), e.g. ["title", "full_text"] :param content_field: Name of field that might contain the answer and will therefore be passed to the Reader Model (e.g. "full_text"). If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will just be returned. :param name_field: Name of field that contains the title of the the doc @@ -1644,7 +1644,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore): :param aws4auth: Authentication for usage with aws elasticsearch (can be generated with the requests-aws4auth package) :param index: Name of index in elasticsearch to use for storing the documents that we want to search. If not existing yet, we will create one. :param label_index: Name of index in elasticsearch to use for storing labels. If not existing yet, we will create one. - :param search_fields: Name of fields used by ElasticsearchRetriever to find matches in the docs to our incoming query (using elastic's multi_match query), e.g. ["title", "full_text"] + :param search_fields: Name of fields used by BM25Retriever to find matches in the docs to our incoming query (using elastic's multi_match query), e.g. ["title", "full_text"] :param content_field: Name of field that might contain the answer and will therefore be passed to the Reader Model (e.g. "full_text"). If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will just be returned. 
:param name_field: Name of field that contains the title of the the doc diff --git a/haystack/json-schemas/haystack-pipeline-1.1.0.schema.json b/haystack/json-schemas/haystack-pipeline-1.1.0.schema.json index f67b85559..3b7389fbb 100644 --- a/haystack/json-schemas/haystack-pipeline-1.1.0.schema.json +++ b/haystack/json-schemas/haystack-pipeline-1.1.0.schema.json @@ -50,6 +50,9 @@ { "$ref": "#/definitions/AzureConverterComponent" }, + { + "$ref": "#/definitions/BM25RetrieverComponent" + }, { "$ref": "#/definitions/CrawlerComponent" }, @@ -65,9 +68,6 @@ { "$ref": "#/definitions/ElasticsearchFilterOnlyRetrieverComponent" }, - { - "$ref": "#/definitions/ElasticsearchRetrieverComponent" - }, { "$ref": "#/definitions/EmbeddingRetrieverComponent" }, @@ -1226,6 +1226,56 @@ ], "additionalProperties": false }, + "BM25RetrieverComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "BM25Retriever" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "document_store": { + "title": "Document Store", + "type": "string" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "all_terms_must_match": { + "title": "All Terms Must Match", + "default": false, + "type": "boolean" + }, + "custom_query": { + "title": "Custom Query", + "type": "string" + } + }, + "required": [ + "document_store" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, "CrawlerComponent": { "type": "object", "properties": { @@ -1546,51 +1596,6 @@ ], "additionalProperties": false }, - "ElasticsearchRetrieverComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "ElasticsearchRetriever" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "document_store": { - "title": "Document Store", - "type": "string" - }, - "top_k": { - "title": "Top K", - "default": 10, - "type": "integer" - }, - "custom_query": { - "title": "Custom Query", - "type": "string" - } - }, - "required": [ - "document_store" - ], - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, "EmbeddingRetrieverComponent": { "type": "object", "properties": { diff --git a/haystack/json-schemas/haystack-pipeline-master.schema.json b/haystack/json-schemas/haystack-pipeline-master.schema.json index 78d8f2066..e7c691917 100644 --- a/haystack/json-schemas/haystack-pipeline-master.schema.json +++ b/haystack/json-schemas/haystack-pipeline-master.schema.json @@ -53,6 +53,9 @@ { "$ref": "#/definitions/AzureConverterComponent" }, + { + "$ref": "#/definitions/BM25RetrieverComponent" + }, { "$ref": "#/definitions/CrawlerComponent" }, @@ -1700,6 +1703,56 @@ ], "additionalProperties": false }, + "BM25RetrieverComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "BM25Retriever" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "document_store": { + "title": "Document Store", + "type": "string" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "all_terms_must_match": { + "title": "All Terms Must Match", + "default": false, + "type": "boolean" + }, + "custom_query": { + "title": "Custom Query", + "type": "string" + } + }, + "required": [ + "document_store" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, "CrawlerComponent": { "type": "object", "properties": { diff --git a/haystack/nodes/__init__.py b/haystack/nodes/__init__.py index 45f2a0588..5b285180e 100644 --- a/haystack/nodes/__init__.py +++ b/haystack/nodes/__init__.py @@ -30,6 +30,7 @@ from haystack.nodes.retriever import ( BaseRetriever, DensePassageRetriever, EmbeddingRetriever, + BM25Retriever, ElasticsearchRetriever, ElasticsearchFilterOnlyRetriever, TfidfRetriever, diff --git a/haystack/nodes/document_classifier/transformers.py b/haystack/nodes/document_classifier/transformers.py index 738900564..65c614abc 100644 --- a/haystack/nodes/document_classifier/transformers.py +++ b/haystack/nodes/document_classifier/transformers.py @@ -28,7 +28,7 @@ class TransformersDocumentClassifier(BaseDocumentClassifier): **Usage example at query time:** ```python | ... 
- | retriever = ElasticsearchRetriever(document_store=document_store) + | retriever = BM25Retriever(document_store=document_store) | document_classifier = TransformersDocumentClassifier(model_name_or_path="bhadresh-savani/distilbert-base-uncased-emotion") | p = Pipeline() | p.add_node(component=retriever, name="Retriever", inputs=["Query"]) diff --git a/haystack/nodes/ranker/sentence_transformers.py b/haystack/nodes/ranker/sentence_transformers.py index 5be04662d..9991ebde6 100644 --- a/haystack/nodes/ranker/sentence_transformers.py +++ b/haystack/nodes/ranker/sentence_transformers.py @@ -28,7 +28,7 @@ class SentenceTransformersRanker(BaseRanker): Usage example: ... - retriever = ElasticsearchRetriever(document_store=document_store) + retriever = BM25Retriever(document_store=document_store) ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2") p = Pipeline() p.add_node(component=retriever, name="ESRetriever", inputs=["Query"]) diff --git a/haystack/nodes/retriever/__init__.py b/haystack/nodes/retriever/__init__.py index 79519f634..27e6e78e2 100644 --- a/haystack/nodes/retriever/__init__.py +++ b/haystack/nodes/retriever/__init__.py @@ -1,4 +1,9 @@ from haystack.nodes.retriever.base import BaseRetriever from haystack.nodes.retriever.dense import DensePassageRetriever, EmbeddingRetriever, TableTextRetriever -from haystack.nodes.retriever.sparse import ElasticsearchRetriever, ElasticsearchFilterOnlyRetriever, TfidfRetriever +from haystack.nodes.retriever.sparse import ( + BM25Retriever, + ElasticsearchRetriever, + ElasticsearchFilterOnlyRetriever, + TfidfRetriever, +) from haystack.nodes.retriever.text2sparql import Text2SparqlRetriever diff --git a/haystack/nodes/retriever/sparse.py b/haystack/nodes/retriever/sparse.py index 29bc35a57..544832f44 100644 --- a/haystack/nodes/retriever/sparse.py +++ b/haystack/nodes/retriever/sparse.py @@ -15,7 +15,7 @@ from haystack.nodes.retriever import BaseRetriever logger = 
logging.getLogger(__name__) -class ElasticsearchRetriever(BaseRetriever): +class BM25Retriever(BaseRetriever): def __init__( self, document_store: KeywordDocumentStore, @@ -139,7 +139,19 @@ class ElasticsearchRetriever(BaseRetriever): return documents -class ElasticsearchFilterOnlyRetriever(ElasticsearchRetriever): +class ElasticsearchRetriever(BM25Retriever): + def __init__( + self, + document_store: KeywordDocumentStore, + top_k: int = 10, + all_terms_must_match: bool = False, + custom_query: Optional[str] = None, + ): + logger.warn("This class is now deprecated. Please use the BM25Retriever instead") + super().__init__(document_store, top_k, all_terms_must_match, custom_query) + + +class ElasticsearchFilterOnlyRetriever(BM25Retriever): """ Naive "Retriever" that returns all documents that match the given filters. No impact of query at all. Helpful for benchmarking, testing and if you want to do QA on small documents without an "active" retriever. diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index 3e8ff547e..01a53cfe5 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -167,7 +167,7 @@ class BasePipeline(ABC): | }, | { | "name": "MyESRetriever", - | "type": "ElasticsearchRetriever", + | "type": "BM25Retriever", | "params": { | "document_store": "MyDocumentStore", # params can reference other components defined in the YAML | "custom_query": None, @@ -217,7 +217,7 @@ class BasePipeline(ABC): | no_ans_boost: -10 | model_name_or_path: deepset/roberta-base-squad2 | - name: MyESRetriever - | type: ElasticsearchRetriever + | type: BM25Retriever | params: | document_store: MyDocumentStore # params can reference other components defined in the YAML | custom_query: null @@ -492,8 +492,8 @@ class Pipeline(BasePipeline): method to process incoming data from predecessor node. :param name: The name for the node. It must not contain any dots. :param inputs: A list of inputs to the node. 
If the predecessor node has a single outgoing edge, just the name - of node is sufficient. For instance, a 'ElasticsearchRetriever' node would always output a single - edge with a list of documents. It can be represented as ["ElasticsearchRetriever"]. + of node is sufficient. For instance, a 'BM25Retriever' node would always output a single + edge with a list of documents. It can be represented as ["BM25Retriever"]. In cases when the predecessor node has multiple outputs, e.g., a "QueryClassifier", the output must be specified explicitly as "QueryClassifier.output_2". @@ -1302,7 +1302,7 @@ class Pipeline(BasePipeline): | no_ans_boost: -10 | model_name_or_path: deepset/roberta-base-squad2 | - name: MyESRetriever - | type: ElasticsearchRetriever + | type: BM25Retriever | params: | document_store: MyDocumentStore # params can reference other components defined in the YAML | custom_query: null @@ -1366,7 +1366,7 @@ class Pipeline(BasePipeline): | }, | { | "name": "MyESRetriever", - | "type": "ElasticsearchRetriever", + | "type": "BM25Retriever", | "params": { | "document_store": "MyDocumentStore", # params can reference other components defined in the YAML | "custom_query": None, @@ -1682,7 +1682,7 @@ class RayPipeline(Pipeline): | no_ans_boost: -10 | model_name_or_path: deepset/roberta-base-squad2 | - name: MyESRetriever - | type: ElasticsearchRetriever + | type: BM25Retriever | params: | document_store: MyDocumentStore # params can reference other components defined in the YAML | custom_query: null @@ -1800,8 +1800,8 @@ class RayPipeline(Pipeline): from Python: https://docs.ray.io/en/master/serve/package-ref.html#servehandle-api. :param name: The name for the node. It must not contain any dots. :param inputs: A list of inputs to the node. If the predecessor node has a single outgoing edge, just the name - of node is sufficient. For instance, a 'ElasticsearchRetriever' node would always output a single - edge with a list of documents. 
It can be represented as ["ElasticsearchRetriever"]. + of node is sufficient. For instance, a 'BM25Retriever' node would always output a single + edge with a list of documents. It can be represented as ["BM25Retriever"]. In cases when the predecessor node has multiple outputs, e.g., a "QueryClassifier", the output must be specified explicitly as "QueryClassifier.output_2". diff --git a/haystack/pipelines/standard_pipelines.py b/haystack/pipelines/standard_pipelines.py index 86aee3f83..459ce08fc 100644 --- a/haystack/pipelines/standard_pipelines.py +++ b/haystack/pipelines/standard_pipelines.py @@ -38,8 +38,8 @@ class BaseStandardPipeline(ABC): method to process incoming data from predecessor node. :param name: The name for the node. It must not contain any dots. :param inputs: A list of inputs to the node. If the predecessor node has a single outgoing edge, just the name - of node is sufficient. For instance, a 'ElasticsearchRetriever' node would always output a single - edge with a list of documents. It can be represented as ["ElasticsearchRetriever"]. + of node is sufficient. For instance, a 'BM25Retriever' node would always output a single + edge with a list of documents. It can be represented as ["BM25Retriever"]. In cases when the predecessor node has multiple outputs, e.g., a "QueryClassifier", the output must be specified explicitly as "QueryClassifier.output_2". 
@@ -100,7 +100,7 @@ class BaseStandardPipeline(ABC): | no_ans_boost: -10 | model_name_or_path: deepset/roberta-base-squad2 | - name: MyESRetriever - | type: ElasticsearchRetriever + | type: BM25Retriever | params: | document_store: MyDocumentStore # params can reference other components defined in the YAML | custom_query: null diff --git a/haystack/utils/squad_to_dpr.py b/haystack/utils/squad_to_dpr.py index 409802159..2ab5f20a7 100644 --- a/haystack/utils/squad_to_dpr.py +++ b/haystack/utils/squad_to_dpr.py @@ -70,7 +70,7 @@ from elasticsearch import Elasticsearch from haystack.document_stores.base import BaseDocumentStore from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore # keep it here ! from haystack.document_stores.faiss import FAISSDocumentStore # keep it here ! -from haystack.nodes.retriever.sparse import ElasticsearchRetriever # keep it here ! # pylint: disable=unused-import +from haystack.nodes.retriever.sparse import BM25Retriever # keep it here ! # pylint: disable=unused-import from haystack.nodes.retriever.dense import DensePassageRetriever # keep it here ! 
# pylint: disable=unused-import from haystack.nodes.preprocessor import PreProcessor from haystack.nodes.retriever.base import BaseRetriever @@ -117,10 +117,8 @@ class HaystackDocumentStore: class HaystackRetriever: def __init__(self, document_store: BaseDocumentStore, retriever_type: str, **kwargs): - if retriever_type not in ["ElasticsearchRetriever", "DensePassageRetriever", "EmbeddingRetriever"]: - raise Exception( - "Use one of these types: ElasticsearchRetriever", "DensePassageRetriever", "EmbeddingRetriever" - ) + if retriever_type not in ["BM25Retriever", "DensePassageRetriever", "EmbeddingRetriever"]: + raise Exception("Use one of these types: BM25Retriever", "DensePassageRetriever", "EmbeddingRetriever") self._retriever_type = retriever_type self._document_store = document_store self._kwargs = kwargs @@ -252,7 +250,7 @@ def main( dpr_output_filename: Path, preprocessor, document_store_type_config: Tuple[str, Dict] = ("ElasticsearchDocumentStore", {}), - retriever_type_config: Tuple[str, Dict] = ("ElasticsearchRetriever", {}), + retriever_type_config: Tuple[str, Dict] = ("BM25Retriever", {}), num_hard_negative_ctxs: int = 30, split_dataset: bool = False, ): @@ -348,7 +346,7 @@ if __name__ == "__main__": preprocessor=preprocessor, document_store_type_config=("ElasticsearchDocumentStore", store_dpr_config), # retriever_type_config=("DensePassageRetriever", retriever_dpr_config), # dpr - retriever_type_config=("ElasticsearchRetriever", retriever_bm25_config), # bm25 + retriever_type_config=("BM25Retriever", retriever_bm25_config), # bm25 num_hard_negative_ctxs=num_hard_negative_ctxs, split_dataset=split_dataset, ) diff --git a/rest_api/pipeline/pipelines.haystack-pipeline.yml b/rest_api/pipeline/pipelines.haystack-pipeline.yml index a65fc0a7c..c3935cdc4 100644 --- a/rest_api/pipeline/pipelines.haystack-pipeline.yml +++ b/rest_api/pipeline/pipelines.haystack-pipeline.yml @@ -8,7 +8,7 @@ components: # define all the building-blocks for Pipeline params: host: 
localhost - name: Retriever - type: ElasticsearchRetriever + type: BM25Retriever params: document_store: DocumentStore # params can reference other components defined in the YAML top_k: 5 diff --git a/test/benchmarks/utils.py b/test/benchmarks/utils.py index bfb481f3e..ef9423bae 100644 --- a/test/benchmarks/utils.py +++ b/test/benchmarks/utils.py @@ -4,7 +4,7 @@ from haystack.document_stores.memory import InMemoryDocumentStore from haystack.document_stores.elasticsearch import Elasticsearch, ElasticsearchDocumentStore, OpenSearchDocumentStore from haystack.document_stores.faiss import FAISSDocumentStore from haystack.document_stores.milvus import MilvusDocumentStore -from haystack.nodes.retriever.sparse import ElasticsearchRetriever, TfidfRetriever +from haystack.nodes.retriever.sparse import BM25Retriever, TfidfRetriever from haystack.nodes.retriever.dense import DensePassageRetriever, EmbeddingRetriever from haystack.nodes.reader.farm import FARMReader from haystack.nodes.reader.transformers import TransformersReader @@ -104,7 +104,7 @@ def get_document_store(document_store_type, similarity="dot_product", index="doc def get_retriever(retriever_name, doc_store, devices): if retriever_name == "elastic": - return ElasticsearchRetriever(doc_store) + return BM25Retriever(doc_store) if retriever_name == "tfidf": return TfidfRetriever(doc_store) if retriever_name == "dpr": diff --git a/test/conftest.py b/test/conftest.py index 8760821a2..398d0d096 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -54,7 +54,7 @@ from haystack.nodes.answer_generator.transformers import Seq2SeqGenerator from haystack.nodes.answer_generator.transformers import RAGenerator from haystack.nodes.ranker import SentenceTransformersRanker from haystack.nodes.document_classifier.transformers import TransformersDocumentClassifier -from haystack.nodes.retriever.sparse import ElasticsearchFilterOnlyRetriever, ElasticsearchRetriever, TfidfRetriever +from haystack.nodes.retriever.sparse import 
ElasticsearchFilterOnlyRetriever, BM25Retriever, TfidfRetriever from haystack.nodes.retriever.dense import DensePassageRetriever, EmbeddingRetriever, TableTextRetriever from haystack.nodes.reader.farm import FARMReader from haystack.nodes.reader.transformers import TransformersReader @@ -622,7 +622,7 @@ def get_retriever(retriever_type, document_store): embed_title=True, ) elif retriever_type == "elasticsearch": - retriever = ElasticsearchRetriever(document_store=document_store) + retriever = BM25Retriever(document_store=document_store) elif retriever_type == "es_filter_only": retriever = ElasticsearchFilterOnlyRetriever(document_store=document_store) elif retriever_type == "table_text_retriever": diff --git a/test/samples/dc/pipeline_config.json b/test/samples/dc/pipeline_config.json index b197a497c..bc4f08f29 100644 --- a/test/samples/dc/pipeline_config.json +++ b/test/samples/dc/pipeline_config.json @@ -11,7 +11,7 @@ }, { "name": "Retriever", - "type": "ElasticsearchRetriever", + "type": "BM25Retriever", "params": { "document_store": "DocumentStore", "top_k": 5 diff --git a/test/samples/pipeline/test_pipeline.yaml b/test/samples/pipeline/test_pipeline.yaml index 4dea24273..89ab3dc33 100644 --- a/test/samples/pipeline/test_pipeline.yaml +++ b/test/samples/pipeline/test_pipeline.yaml @@ -8,7 +8,7 @@ components: model_name_or_path: deepset/roberta-base-squad2 num_processes: 0 - name: ESRetriever - type: ElasticsearchRetriever + type: BM25Retriever params: document_store: DocumentStore - name: DocumentStore diff --git a/test/samples/pipeline/test_ray_pipeline.yaml b/test/samples/pipeline/test_ray_pipeline.yaml index 95b480fdb..3d7fedc96 100644 --- a/test/samples/pipeline/test_ray_pipeline.yaml +++ b/test/samples/pipeline/test_ray_pipeline.yaml @@ -8,7 +8,7 @@ components: model_name_or_path: deepset/roberta-base-squad2 num_processes: 0 - name: ESRetriever - type: ElasticsearchRetriever + type: BM25Retriever params: document_store: DocumentStore - name: DocumentStore 
diff --git a/test/test_eval.py b/test/test_eval.py index 6d0ea185b..de2d0b9f4 100644 --- a/test/test_eval.py +++ b/test/test_eval.py @@ -7,7 +7,7 @@ from haystack.nodes.preprocessor import PreProcessor from haystack.nodes.evaluator import EvalAnswers, EvalDocuments from haystack.nodes.query_classifier.transformers import TransformersQueryClassifier from haystack.nodes.retriever.dense import DensePassageRetriever -from haystack.nodes.retriever.sparse import ElasticsearchRetriever +from haystack.nodes.retriever.sparse import BM25Retriever from haystack.pipelines.base import Pipeline from haystack.pipelines import ExtractiveQAPipeline, GenerativeQAPipeline, SearchSummarizationPipeline from haystack.pipelines.standard_pipelines import ( @@ -950,7 +950,7 @@ def test_question_generation_eval(retriever_with_docs, question_generator): @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) @pytest.mark.parametrize("reader", ["farm"], indirect=True) def test_qa_multi_retriever_pipeline_eval(document_store_with_docs, reader): - es_retriever = ElasticsearchRetriever(document_store=document_store_with_docs) + es_retriever = BM25Retriever(document_store=document_store_with_docs) dpr_retriever = DensePassageRetriever(document_store_with_docs) document_store_with_docs.update_embeddings(retriever=dpr_retriever) @@ -1014,7 +1014,7 @@ def test_qa_multi_retriever_pipeline_eval(document_store_with_docs, reader): @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) @pytest.mark.parametrize("reader", ["farm"], indirect=True) def test_multi_retriever_pipeline_eval(document_store_with_docs, reader): - es_retriever = ElasticsearchRetriever(document_store=document_store_with_docs) + es_retriever = BM25Retriever(document_store=document_store_with_docs) dpr_retriever = DensePassageRetriever(document_store_with_docs) document_store_with_docs.update_embeddings(retriever=dpr_retriever) @@ -1073,7 +1073,7 @@ def 
test_multi_retriever_pipeline_eval(document_store_with_docs, reader): @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) @pytest.mark.parametrize("reader", ["farm"], indirect=True) def test_multi_retriever_pipeline_with_asymmetric_qa_eval(document_store_with_docs, reader): - es_retriever = ElasticsearchRetriever(document_store=document_store_with_docs) + es_retriever = BM25Retriever(document_store=document_store_with_docs) dpr_retriever = DensePassageRetriever(document_store_with_docs) document_store_with_docs.update_embeddings(retriever=dpr_retriever) diff --git a/test/test_extractor.py b/test/test_extractor.py index 885b3f64c..91230e07e 100644 --- a/test/test_extractor.py +++ b/test/test_extractor.py @@ -1,6 +1,6 @@ import pytest -from haystack.nodes.retriever.sparse import ElasticsearchRetriever +from haystack.nodes.retriever.sparse import BM25Retriever from haystack.nodes.reader import FARMReader from haystack.pipelines import Pipeline @@ -10,7 +10,7 @@ from haystack.nodes.extractor import EntityExtractor, simplify_ner_for_qa @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) def test_extractor(document_store_with_docs): - es_retriever = ElasticsearchRetriever(document_store=document_store_with_docs) + es_retriever = BM25Retriever(document_store=document_store_with_docs) ner = EntityExtractor() reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", num_processes=0) @@ -30,7 +30,7 @@ def test_extractor(document_store_with_docs): @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) def test_extractor_output_simplifier(document_store_with_docs): - es_retriever = ElasticsearchRetriever(document_store=document_store_with_docs) + es_retriever = BM25Retriever(document_store=document_store_with_docs) ner = EntityExtractor() reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", num_processes=0) diff --git a/test/test_pipeline.py 
b/test/test_pipeline.py index a4b7ce41b..6292b4e90 100644 --- a/test/test_pipeline.py +++ b/test/test_pipeline.py @@ -22,7 +22,7 @@ from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore from haystack.nodes.other.join_docs import JoinDocuments from haystack.nodes.base import BaseComponent from haystack.nodes.retriever.base import BaseRetriever -from haystack.nodes.retriever.sparse import ElasticsearchRetriever +from haystack.nodes.retriever.sparse import BM25Retriever from haystack.pipelines import Pipeline, DocumentSearchPipeline, RootNode from haystack.pipelines.config import validate_config_strings from haystack.pipelines.utils import generate_code @@ -409,7 +409,7 @@ def test_generate_code_simple_pipeline(): "components": [ { "name": "retri", - "type": "ElasticsearchRetriever", + "type": "BM25Retriever", "params": {"document_store": "ElasticsearchDocumentStore", "top_k": 20}, }, { @@ -424,7 +424,7 @@ def test_generate_code_simple_pipeline(): code = generate_code(pipeline_config=config, pipeline_variable_name="p", generate_imports=False) assert code == ( 'elasticsearch_document_store = ElasticsearchDocumentStore(index="my-index")\n' - "retri = ElasticsearchRetriever(document_store=elasticsearch_document_store, top_k=20)\n" + "retri = BM25Retriever(document_store=elasticsearch_document_store, top_k=20)\n" "\n" "p = Pipeline()\n" 'p.add_node(component=retri, name="retri", inputs=["Query"])' @@ -436,7 +436,7 @@ def test_generate_code_imports(): "version": "master", "components": [ {"name": "DocumentStore", "type": "ElasticsearchDocumentStore"}, - {"name": "retri", "type": "ElasticsearchRetriever", "params": {"document_store": "DocumentStore"}}, + {"name": "retri", "type": "BM25Retriever", "params": {"document_store": "DocumentStore"}}, {"name": "retri2", "type": "TfidfRetriever", "params": {"document_store": "DocumentStore"}}, ], "pipelines": [ @@ -450,11 +450,11 @@ def test_generate_code_imports(): code = 
generate_code(pipeline_config=pipeline_config, pipeline_variable_name="p", generate_imports=True) assert code == ( "from haystack.document_stores import ElasticsearchDocumentStore\n" - "from haystack.nodes import ElasticsearchRetriever, TfidfRetriever\n" + "from haystack.nodes import BM25Retriever, TfidfRetriever\n" "from haystack.pipelines import Pipeline\n" "\n" "document_store = ElasticsearchDocumentStore()\n" - "retri = ElasticsearchRetriever(document_store=document_store)\n" + "retri = BM25Retriever(document_store=document_store)\n" "retri_2 = TfidfRetriever(document_store=document_store)\n" "\n" "p = Pipeline()\n" @@ -468,7 +468,7 @@ def test_generate_code_imports_no_pipeline_cls(): "version": "master", "components": [ {"name": "DocumentStore", "type": "ElasticsearchDocumentStore"}, - {"name": "retri", "type": "ElasticsearchRetriever", "params": {"document_store": "DocumentStore"}}, + {"name": "retri", "type": "BM25Retriever", "params": {"document_store": "DocumentStore"}}, ], "pipelines": [{"name": "Query", "nodes": [{"name": "retri", "inputs": ["Query"]}]}], } @@ -481,10 +481,10 @@ def test_generate_code_imports_no_pipeline_cls(): ) assert code == ( "from haystack.document_stores import ElasticsearchDocumentStore\n" - "from haystack.nodes import ElasticsearchRetriever\n" + "from haystack.nodes import BM25Retriever\n" "\n" "document_store = ElasticsearchDocumentStore()\n" - "retri = ElasticsearchRetriever(document_store=document_store)\n" + "retri = BM25Retriever(document_store=document_store)\n" "\n" "p = Pipeline()\n" 'p.add_node(component=retri, name="retri", inputs=["Query"])' @@ -496,7 +496,7 @@ def test_generate_code_comment(): "version": "master", "components": [ {"name": "DocumentStore", "type": "ElasticsearchDocumentStore"}, - {"name": "retri", "type": "ElasticsearchRetriever", "params": {"document_store": "DocumentStore"}}, + {"name": "retri", "type": "BM25Retriever", "params": {"document_store": "DocumentStore"}}, ], "pipelines": [{"name": 
"Query", "nodes": [{"name": "retri", "inputs": ["Query"]}]}], } @@ -507,11 +507,11 @@ def test_generate_code_comment(): "# This is my comment\n" "# ...and here is a new line\n" "from haystack.document_stores import ElasticsearchDocumentStore\n" - "from haystack.nodes import ElasticsearchRetriever\n" + "from haystack.nodes import BM25Retriever\n" "from haystack.pipelines import Pipeline\n" "\n" "document_store = ElasticsearchDocumentStore()\n" - "retri = ElasticsearchRetriever(document_store=document_store)\n" + "retri = BM25Retriever(document_store=document_store)\n" "\n" "p = Pipeline()\n" 'p.add_node(component=retri, name="retri", inputs=["Query"])' @@ -536,7 +536,7 @@ def test_generate_code_is_component_order_invariant(): doc_store = {"name": "ElasticsearchDocumentStore", "type": "ElasticsearchDocumentStore"} es_retriever = { "name": "EsRetriever", - "type": "ElasticsearchRetriever", + "type": "BM25Retriever", "params": {"document_store": "ElasticsearchDocumentStore"}, } emb_retriever = { @@ -557,7 +557,7 @@ def test_generate_code_is_component_order_invariant(): expected_code = ( "elasticsearch_document_store = ElasticsearchDocumentStore()\n" - "es_retriever = ElasticsearchRetriever(document_store=elasticsearch_document_store)\n" + "es_retriever = BM25Retriever(document_store=elasticsearch_document_store)\n" 'embedding_retriever = EmbeddingRetriever(document_store=elasticsearch_document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2")\n' "join_results = JoinDocuments()\n" "\n" @@ -692,7 +692,7 @@ def test_load_from_deepset_cloud_query(): ) retriever = query_pipeline.get_node("Retriever") document_store = retriever.document_store - assert isinstance(retriever, ElasticsearchRetriever) + assert isinstance(retriever, BM25Retriever) assert isinstance(document_store, DeepsetCloudDocumentStore) assert document_store == query_pipeline.get_document_store() @@ -920,7 +920,7 @@ def test_save_nonexisting_pipeline_to_deepset_cloud(): ) es_document_store = 
ElasticsearchDocumentStore() - es_retriever = ElasticsearchRetriever(document_store=es_document_store) + es_retriever = BM25Retriever(document_store=es_document_store) file_converter = TextConverter() preprocessor = PreProcessor() diff --git a/test/test_pipeline_debug_and_validation.py b/test/test_pipeline_debug_and_validation.py index 851b8a456..c5fc4aab7 100644 --- a/test/test_pipeline_debug_and_validation.py +++ b/test/test_pipeline_debug_and_validation.py @@ -4,7 +4,7 @@ import json import pytest from haystack.pipelines import Pipeline, RootNode -from haystack.nodes import FARMReader, ElasticsearchRetriever +from haystack.nodes import FARMReader, BM25Retriever from .conftest import SAMPLES_PATH, MockRetriever as BaseMockRetriever, MockReader @@ -26,7 +26,7 @@ class MockRetriever(BaseMockRetriever): def test_node_names_validation(document_store_with_docs, tmp_path): pipeline = Pipeline() pipeline.add_node( - component=ElasticsearchRetriever(document_store=document_store_with_docs), name="Retriever", inputs=["Query"] + component=BM25Retriever(document_store=document_store_with_docs), name="Retriever", inputs=["Query"] ) pipeline.add_node( component=FARMReader(model_name_or_path="deepset/minilm-uncased-squad2", num_processes=0), @@ -56,7 +56,7 @@ def test_node_names_validation(document_store_with_docs, tmp_path): @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) def test_debug_attributes_global(document_store_with_docs, tmp_path): - es_retriever = ElasticsearchRetriever(document_store=document_store_with_docs) + es_retriever = BM25Retriever(document_store=document_store_with_docs) reader = FARMReader(model_name_or_path="deepset/minilm-uncased-squad2", num_processes=0) pipeline = Pipeline() @@ -86,7 +86,7 @@ def test_debug_attributes_global(document_store_with_docs, tmp_path): @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) def test_debug_attributes_per_node(document_store_with_docs, 
tmp_path): - es_retriever = ElasticsearchRetriever(document_store=document_store_with_docs) + es_retriever = BM25Retriever(document_store=document_store_with_docs) reader = FARMReader(model_name_or_path="deepset/minilm-uncased-squad2", num_processes=0) pipeline = Pipeline() @@ -112,7 +112,7 @@ def test_debug_attributes_per_node(document_store_with_docs, tmp_path): @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) def test_global_debug_attributes_override_node_ones(document_store_with_docs, tmp_path): - es_retriever = ElasticsearchRetriever(document_store=document_store_with_docs) + es_retriever = BM25Retriever(document_store=document_store_with_docs) reader = FARMReader(model_name_or_path="deepset/minilm-uncased-squad2", num_processes=0) pipeline = Pipeline() diff --git a/test/test_retriever.py b/test/test_retriever.py index d0e850911..24c3bd0f5 100644 --- a/test/test_retriever.py +++ b/test/test_retriever.py @@ -14,7 +14,7 @@ from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore from haystack.document_stores.faiss import FAISSDocumentStore from haystack.document_stores import MilvusDocumentStore from haystack.nodes.retriever.dense import DensePassageRetriever, EmbeddingRetriever, TableTextRetriever -from haystack.nodes.retriever.sparse import ElasticsearchRetriever, ElasticsearchFilterOnlyRetriever, TfidfRetriever +from haystack.nodes.retriever.sparse import BM25Retriever, ElasticsearchFilterOnlyRetriever, TfidfRetriever from transformers import DPRContextEncoderTokenizerFast, DPRQuestionEncoderTokenizerFast from .conftest import SAMPLES_PATH @@ -70,7 +70,7 @@ def docs(): indirect=True, ) def test_retrieval(retriever_with_docs, document_store_with_docs): - if not isinstance(retriever_with_docs, (ElasticsearchRetriever, ElasticsearchFilterOnlyRetriever, TfidfRetriever)): + if not isinstance(retriever_with_docs, (BM25Retriever, ElasticsearchFilterOnlyRetriever, TfidfRetriever)): 
document_store_with_docs.update_embeddings(retriever_with_docs) # test without filters @@ -121,7 +121,7 @@ def test_elasticsearch_custom_query(): document_store.write_documents(documents) # test custom "terms" query - retriever = ElasticsearchRetriever( + retriever = BM25Retriever( document_store=document_store, custom_query=""" { @@ -136,7 +136,7 @@ def test_elasticsearch_custom_query(): assert len(results) == 4 # test custom "term" query - retriever = ElasticsearchRetriever( + retriever = BM25Retriever( document_store=document_store, custom_query=""" { @@ -399,7 +399,7 @@ def test_elasticsearch_highlight(): document_store.write_documents(documents) # Enabled highlighting on "title"&"content" field only using custom query - retriever_1 = ElasticsearchRetriever( + retriever_1 = BM25Retriever( document_store=document_store, custom_query="""{ "size": 20, @@ -441,7 +441,7 @@ def test_elasticsearch_highlight(): assert results[0].meta["highlighted"]["content"] == ["The **green**", "**tea** plant", "range of **healthy**"] # Enabled highlighting on "title" field only using custom query - retriever_2 = ElasticsearchRetriever( + retriever_2 = BM25Retriever( document_store=document_store, custom_query="""{ "size": 20, diff --git a/test/test_standard_pipelines.py b/test/test_standard_pipelines.py index d5c5f9e5b..1b476ae7b 100644 --- a/test/test_standard_pipelines.py +++ b/test/test_standard_pipelines.py @@ -9,7 +9,7 @@ from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore from haystack.pipelines import Pipeline, FAQPipeline, DocumentSearchPipeline, RootNode, MostSimilarDocumentsPipeline from haystack.nodes import ( DensePassageRetriever, - ElasticsearchRetriever, + BM25Retriever, SklearnQueryClassifier, TransformersQueryClassifier, JoinDocuments, @@ -111,7 +111,7 @@ def test_most_similar_documents_pipeline(retriever, document_store): @pytest.mark.elasticsearch @pytest.mark.parametrize("document_store_dot_product_with_docs", ["elasticsearch"], 
indirect=True) def test_join_merge_no_weights(document_store_dot_product_with_docs): - es = ElasticsearchRetriever(document_store=document_store_dot_product_with_docs) + es = BM25Retriever(document_store=document_store_dot_product_with_docs) dpr = DensePassageRetriever( document_store=document_store_dot_product_with_docs, query_embedding_model="facebook/dpr-question_encoder-single-nq-base", @@ -134,7 +134,7 @@ def test_join_merge_no_weights(document_store_dot_product_with_docs): @pytest.mark.elasticsearch @pytest.mark.parametrize("document_store_dot_product_with_docs", ["elasticsearch"], indirect=True) def test_join_merge_with_weights(document_store_dot_product_with_docs): - es = ElasticsearchRetriever(document_store=document_store_dot_product_with_docs) + es = BM25Retriever(document_store=document_store_dot_product_with_docs) dpr = DensePassageRetriever( document_store=document_store_dot_product_with_docs, query_embedding_model="facebook/dpr-question_encoder-single-nq-base", @@ -158,7 +158,7 @@ def test_join_merge_with_weights(document_store_dot_product_with_docs): @pytest.mark.elasticsearch @pytest.mark.parametrize("document_store_dot_product_with_docs", ["elasticsearch"], indirect=True) def test_join_concatenate(document_store_dot_product_with_docs): - es = ElasticsearchRetriever(document_store=document_store_dot_product_with_docs) + es = BM25Retriever(document_store=document_store_dot_product_with_docs) dpr = DensePassageRetriever( document_store=document_store_dot_product_with_docs, query_embedding_model="facebook/dpr-question_encoder-single-nq-base", @@ -181,7 +181,7 @@ def test_join_concatenate(document_store_dot_product_with_docs): @pytest.mark.elasticsearch @pytest.mark.parametrize("document_store_dot_product_with_docs", ["elasticsearch"], indirect=True) def test_join_concatenate_with_topk(document_store_dot_product_with_docs): - es = ElasticsearchRetriever(document_store=document_store_dot_product_with_docs) + es = 
BM25Retriever(document_store=document_store_dot_product_with_docs) dpr = DensePassageRetriever( document_store=document_store_dot_product_with_docs, query_embedding_model="facebook/dpr-question_encoder-single-nq-base", @@ -207,7 +207,7 @@ def test_join_concatenate_with_topk(document_store_dot_product_with_docs): @pytest.mark.parametrize("document_store_dot_product_with_docs", ["elasticsearch"], indirect=True) @pytest.mark.parametrize("reader", ["farm"], indirect=True) def test_join_with_reader(document_store_dot_product_with_docs, reader): - es = ElasticsearchRetriever(document_store=document_store_dot_product_with_docs) + es = BM25Retriever(document_store=document_store_dot_product_with_docs) dpr = DensePassageRetriever( document_store=document_store_dot_product_with_docs, query_embedding_model="facebook/dpr-question_encoder-single-nq-base", @@ -232,7 +232,7 @@ def test_join_with_reader(document_store_dot_product_with_docs, reader): @pytest.mark.elasticsearch @pytest.mark.parametrize("document_store_dot_product_with_docs", ["elasticsearch"], indirect=True) def test_join_with_rrf(document_store_dot_product_with_docs): - es = ElasticsearchRetriever(document_store=document_store_dot_product_with_docs) + es = BM25Retriever(document_store=document_store_dot_product_with_docs) dpr = DensePassageRetriever( document_store=document_store_dot_product_with_docs, query_embedding_model="facebook/dpr-question_encoder-single-nq-base", diff --git a/tutorials/Tutorial11_Pipelines.ipynb b/tutorials/Tutorial11_Pipelines.ipynb index c23092eb5..737742ae0 100644 --- a/tutorials/Tutorial11_Pipelines.ipynb +++ b/tutorials/Tutorial11_Pipelines.ipynb @@ -191,7 +191,7 @@ }, "source": [ "Here we initialize the core components that we will be gluing together using the `Pipeline` class.\n", - "We have a `DocumentStore`, an `ElasticsearchRetriever` and a `FARMReader`.\n", + "We have a `DocumentStore`, a `BM25Retriever` and a `FARMReader`.\n", "These can be combined to create a classic
Retriever-Reader pipeline that is designed\n", "to perform Open Domain Question Answering." ] @@ -210,7 +210,7 @@ "from haystack import Pipeline\n", "from haystack.utils import launch_es\n", "from haystack.document_stores import ElasticsearchDocumentStore\n", - "from haystack.nodes import ElasticsearchRetriever, EmbeddingRetriever, FARMReader\n", + "from haystack.nodes import BM25Retriever, EmbeddingRetriever, FARMReader\n", "\n", "\n", "# Initialize DocumentStore and index documents\n", @@ -220,7 +220,7 @@ "document_store.write_documents(got_docs)\n", "\n", "# Initialize Sparse retriever\n", - "es_retriever = ElasticsearchRetriever(document_store=document_store)\n", + "es_retriever = BM25Retriever(document_store=document_store)\n", "\n", "# Initialize dense retriever\n", "embedding_retriever = EmbeddingRetriever(\n", @@ -434,7 +434,7 @@ "source": [ "Pipelines offer a very simple way to ensemble together different components.\n", "In this example, we are going to combine the power of an `EmbeddingRetriever`\n", - "with the keyword based `ElasticsearchRetriever`.\n", + "with the keyword based `BM25Retriever`.\n", "See our [documentation](https://haystack.deepset.ai/docs/latest/retrievermd) to understand why\n", "we might want to combine a dense and sparse retriever.\n", "\n", @@ -683,7 +683,7 @@ " no_ans_boost: -10\n", " model_name_or_path: deepset/roberta-base-squad2\n", "- name: MyESRetriever\n", - " type: ElasticsearchRetriever\n", + " type: BM25Retriever\n", " params:\n", " document_store: MyDocumentStore # params can reference other components defined in the YAML\n", " custom_query: null\n", diff --git a/tutorials/Tutorial11_Pipelines.py b/tutorials/Tutorial11_Pipelines.py index 1f10d64dd..8a10c7330 100644 --- a/tutorials/Tutorial11_Pipelines.py +++ b/tutorials/Tutorial11_Pipelines.py @@ -9,14 +9,7 @@ from haystack.utils import ( from pprint import pprint from haystack import Pipeline from haystack.document_stores import ElasticsearchDocumentStore -from 
haystack.nodes import ( - ElasticsearchRetriever, - EmbeddingRetriever, - FARMReader, - RAGenerator, - BaseComponent, - JoinDocuments, -) +from haystack.nodes import BM25Retriever, EmbeddingRetriever, FARMReader, RAGenerator, BaseComponent, JoinDocuments from haystack.pipelines import ExtractiveQAPipeline, DocumentSearchPipeline, GenerativeQAPipeline @@ -36,7 +29,7 @@ def tutorial11_pipelines(): document_store.write_documents(got_docs) # Initialize Sparse retriever - es_retriever = ElasticsearchRetriever(document_store=document_store) + es_retriever = BM25Retriever(document_store=document_store) # Initialize dense retriever embedding_retriever = EmbeddingRetriever( diff --git a/tutorials/Tutorial13_Question_generation.ipynb b/tutorials/Tutorial13_Question_generation.ipynb index b0f343f7d..a7dc74e03 100644 --- a/tutorials/Tutorial13_Question_generation.ipynb +++ b/tutorials/Tutorial13_Question_generation.ipynb @@ -69,7 +69,7 @@ "\n", "from pprint import pprint\n", "from tqdm import tqdm\n", - "from haystack.nodes import QuestionGenerator, ElasticsearchRetriever, FARMReader\n", + "from haystack.nodes import QuestionGenerator, BM25Retriever, FARMReader\n", "from haystack.document_stores import ElasticsearchDocumentStore\n", "from haystack.pipelines import (\n", " QuestionGenerationPipeline,\n", @@ -228,7 +228,7 @@ }, "outputs": [], "source": [ - "retriever = ElasticsearchRetriever(document_store=document_store)\n", + "retriever = BM25Retriever(document_store=document_store)\n", "rqg_pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator)\n", "\n", "print(f\"\\n * Generating questions for documents matching the query 'Arya Stark'\\n\")\n", diff --git a/tutorials/Tutorial13_Question_generation.py b/tutorials/Tutorial13_Question_generation.py index bee285c82..cd2e10261 100644 --- a/tutorials/Tutorial13_Question_generation.py +++ b/tutorials/Tutorial13_Question_generation.py @@ -1,5 +1,5 @@ from tqdm import tqdm -from haystack.nodes import 
QuestionGenerator, ElasticsearchRetriever, FARMReader, TransformersTranslator +from haystack.nodes import QuestionGenerator, BM25Retriever, FARMReader, TransformersTranslator from haystack.document_stores import ElasticsearchDocumentStore from haystack.pipelines import ( QuestionGenerationPipeline, @@ -56,7 +56,7 @@ def tutorial13_question_generation(): print("\RetrieverQuestionGenerationPipeline") print("==================================") - retriever = ElasticsearchRetriever(document_store=document_store) + retriever = BM25Retriever(document_store=document_store) rqg_pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator) print(f"\n * Generating questions for documents matching the query 'Arya Stark'\n") diff --git a/tutorials/Tutorial14_Query_Classifier.ipynb b/tutorials/Tutorial14_Query_Classifier.ipynb index 5354f22d0..dd56041f9 100644 --- a/tutorials/Tutorial14_Query_Classifier.ipynb +++ b/tutorials/Tutorial14_Query_Classifier.ipynb @@ -382,7 +382,7 @@ "from haystack.pipelines import Pipeline\n", "from haystack.document_stores import ElasticsearchDocumentStore\n", "from haystack.nodes import (\n", - " ElasticsearchRetriever,\n", + " BM25Retriever,\n", " EmbeddingRetriever,\n", " FARMReader,\n", " TransformersQueryClassifier,\n", @@ -404,7 +404,7 @@ "document_store.write_documents(got_docs)\n", "\n", "# Initialize Sparse retriever\n", - "es_retriever = ElasticsearchRetriever(document_store=document_store)\n", + "es_retriever = BM25Retriever(document_store=document_store)\n", "\n", "# Initialize dense retriever\n", "embedding_retriever = EmbeddingRetriever(\n", diff --git a/tutorials/Tutorial14_Query_Classifier.py b/tutorials/Tutorial14_Query_Classifier.py index bb2d9e851..604b87f94 100644 --- a/tutorials/Tutorial14_Query_Classifier.py +++ b/tutorials/Tutorial14_Query_Classifier.py @@ -9,7 +9,7 @@ from haystack.utils import ( from haystack.pipelines import Pipeline from haystack.document_stores import ElasticsearchDocumentStore from 
haystack.nodes import ( - ElasticsearchRetriever, + BM25Retriever, EmbeddingRetriever, FARMReader, TransformersQueryClassifier, @@ -34,7 +34,7 @@ def tutorial14_query_classifier(): document_store.write_documents(got_docs) # Initialize Sparse retriever - es_retriever = ElasticsearchRetriever(document_store=document_store) + es_retriever = BM25Retriever(document_store=document_store) # Initialize dense retriever embedding_retriever = EmbeddingRetriever( diff --git a/tutorials/Tutorial15_TableQA.ipynb b/tutorials/Tutorial15_TableQA.ipynb index 16348a0cd..7b8e065d9 100644 --- a/tutorials/Tutorial15_TableQA.ipynb +++ b/tutorials/Tutorial15_TableQA.ipynb @@ -9,7 +9,7 @@ "# Open-Domain QA on Tables\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial15_TableQA.ipynb)\n", "\n", - "This tutorial shows you how to perform question-answering on tables using the `TableTextRetriever` or `ElasticsearchRetriever` as retriever node and the `TableReader` as reader node." + "This tutorial shows you how to perform question-answering on tables using the `TableTextRetriever` or `BM25Retriever` as retriever node and the `TableReader` as reader node." 
] }, { @@ -243,7 +243,7 @@ "\n", "**Alternatives:**\n", "\n", - "- `ElasticsearchRetriever` that uses BM25 algorithm\n" + "- `BM25Retriever` that uses BM25 algorithm\n" ] }, { @@ -284,9 +284,9 @@ }, "outputs": [], "source": [ - "## Alternative: ElasticsearchRetriever\n", - "# from haystack.nodes.retriever import ElasticsearchRetriever\n", - "# retriever = ElasticsearchRetriever(document_store=document_store)" + "## Alternative: BM25Retriever\n", + "# from haystack.nodes.retriever import BM25Retriever\n", + "# retriever = BM25Retriever(document_store=document_store)" ] }, { diff --git a/tutorials/Tutorial15_TableQA.py b/tutorials/Tutorial15_TableQA.py index 5501b4bc8..040209444 100644 --- a/tutorials/Tutorial15_TableQA.py +++ b/tutorials/Tutorial15_TableQA.py @@ -67,9 +67,9 @@ def tutorial15_tableqa(): # Add table embeddings to the tables in DocumentStore document_store.update_embeddings(retriever=retriever) - ## Alternative: ElasticsearchRetriever - # from haystack.nodes.retriever import ElasticsearchRetriever - # retriever = ElasticsearchRetriever(document_store=document_store) + ## Alternative: BM25Retriever + # from haystack.nodes.retriever import BM25Retriever + # retriever = BM25Retriever(document_store=document_store) # Try the Retriever from haystack.utils import print_documents diff --git a/tutorials/Tutorial16_Document_Classifier_at_Index_Time.ipynb b/tutorials/Tutorial16_Document_Classifier_at_Index_Time.ipynb index 32ec2cce7..4ea4666ab 100644 --- a/tutorials/Tutorial16_Document_Classifier_at_Index_Time.ipynb +++ b/tutorials/Tutorial16_Document_Classifier_at_Index_Time.ipynb @@ -68,7 +68,7 @@ "source": [ "# Here are the imports we need\n", "from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore\n", - "from haystack.nodes import PreProcessor, TransformersDocumentClassifier, FARMReader, ElasticsearchRetriever\n", + "from haystack.nodes import PreProcessor, TransformersDocumentClassifier, FARMReader, BM25Retriever\n", "from 
haystack.schema import Document\n", "from haystack.utils import convert_files_to_docs, fetch_archive_from_http, print_answers" ] @@ -335,7 +335,7 @@ "# Initialize QA-Pipeline\n", "from haystack.pipelines import ExtractiveQAPipeline\n", "\n", - "retriever = ElasticsearchRetriever(document_store=document_store)\n", + "retriever = BM25Retriever(document_store=document_store)\n", "reader = FARMReader(model_name_or_path=\"deepset/roberta-base-squad2\", use_gpu=True)\n", "pipe = ExtractiveQAPipeline(reader, retriever)" ] diff --git a/tutorials/Tutorial16_Document_Classifier_at_Index_Time.py b/tutorials/Tutorial16_Document_Classifier_at_Index_Time.py index 5d7f23cc8..74613dcd3 100644 --- a/tutorials/Tutorial16_Document_Classifier_at_Index_Time.py +++ b/tutorials/Tutorial16_Document_Classifier_at_Index_Time.py @@ -19,7 +19,7 @@ # Here are the imports we need from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore -from haystack.nodes import PreProcessor, TransformersDocumentClassifier, FARMReader, ElasticsearchRetriever +from haystack.nodes import PreProcessor, TransformersDocumentClassifier, FARMReader, BM25Retriever from haystack.schema import Document from haystack.utils import convert_files_to_docs, fetch_archive_from_http, print_answers, launch_es @@ -98,7 +98,7 @@ def tutorial16_document_classifier_at_index_time(): # Initialize QA-Pipeline from haystack.pipelines import ExtractiveQAPipeline - retriever = ElasticsearchRetriever(document_store=document_store) + retriever = BM25Retriever(document_store=document_store) reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True) pipe = ExtractiveQAPipeline(reader, retriever) diff --git a/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb b/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb index f12076d53..50e9b4dc0 100644 --- a/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb +++ b/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb @@ -213,7 +213,7 @@ "\n", "**Alternatives:**\n", "\n", - "- Customize 
the `ElasticsearchRetriever`with custom queries (e.g. boosting) and filters\n", + "- Customize the `BM25Retriever` with custom queries (e.g. boosting) and filters\n", "- Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging\n", "- Use `EmbeddingRetriever` to find candidate documents based on the similarity of embeddings (e.g. created via Sentence-BERT)\n", "- Use `DensePassageRetriever` to use different embedding models for passage and query (see Tutorial 6)" @@ -225,9 +225,9 @@ "metadata": {}, "outputs": [], "source": [ - "from haystack.nodes import ElasticsearchRetriever\n", + "from haystack.nodes import BM25Retriever\n", "\n", - "retriever = ElasticsearchRetriever(document_store=document_store)" + "retriever = BM25Retriever(document_store=document_store)" ] }, { diff --git a/tutorials/Tutorial1_Basic_QA_Pipeline.py b/tutorials/Tutorial1_Basic_QA_Pipeline.py index 221c88692..56567776e 100755 --- a/tutorials/Tutorial1_Basic_QA_Pipeline.py +++ b/tutorials/Tutorial1_Basic_QA_Pipeline.py @@ -11,8 +11,8 @@ import logging from haystack.document_stores import ElasticsearchDocumentStore -from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers, launch_es -from haystack.nodes import FARMReader, TransformersReader, ElasticsearchRetriever +from haystack.utils import clean_wiki_text, convert_files_to_dicts, fetch_archive_from_http, print_answers, launch_es +from haystack.nodes import FARMReader, TransformersReader, BM25Retriever def tutorial1_basic_qa_pipeline(): @@ -75,12 +75,12 @@ def tutorial1_basic_qa_pipeline(): # They use some simple but fast algorithm. # **Here:** We use Elasticsearch's default BM25 algorithm # **Alternatives:** - # - Customize the `ElasticsearchRetriever`with custom queries (e.g. boosting) and filters + # - Customize the `BM25Retriever` with custom queries (e.g. 
boosting) and filters # - Use `EmbeddingRetriever` to find candidate documents based on the similarity of # embeddings (e.g. created via Sentence-BERT) # - Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging - retriever = ElasticsearchRetriever(document_store=document_store) + retriever = BM25Retriever(document_store=document_store) # Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick-prototypes # with SQLite document store. diff --git a/tutorials/Tutorial5_Evaluation.ipynb b/tutorials/Tutorial5_Evaluation.ipynb index de0919f26..01be0ae78 100644 --- a/tutorials/Tutorial5_Evaluation.ipynb +++ b/tutorials/Tutorial5_Evaluation.ipynb @@ -248,9 +248,9 @@ "outputs": [], "source": [ "# Initialize Retriever\n", - "from haystack.nodes import ElasticsearchRetriever\n", + "from haystack.nodes import BM25Retriever\n", "\n", - "retriever = ElasticsearchRetriever(document_store=document_store)\n", + "retriever = BM25Retriever(document_store=document_store)\n", "\n", "# Alternative: Evaluate dense retrievers (EmbeddingRetriever or DensePassageRetriever)\n", "# The EmbeddingRetriever uses a single transformer based encoder model for query and document.\n", diff --git a/tutorials/Tutorial5_Evaluation.py b/tutorials/Tutorial5_Evaluation.py index 14192537a..b2164c3c5 100644 --- a/tutorials/Tutorial5_Evaluation.py +++ b/tutorials/Tutorial5_Evaluation.py @@ -1,5 +1,5 @@ from haystack.document_stores import ElasticsearchDocumentStore -from haystack.nodes import ElasticsearchRetriever, DensePassageRetriever, EmbeddingRetriever, FARMReader, PreProcessor +from haystack.nodes import BM25Retriever, DensePassageRetriever, EmbeddingRetriever, FARMReader, PreProcessor from haystack.utils import fetch_archive_from_http, launch_es from haystack.pipelines import ExtractiveQAPipeline, DocumentSearchPipeline from haystack.schema import Answer, Document, EvaluationResult, Label, MultiLabel, Span @@ -62,9 
+62,9 @@ def tutorial5_evaluation(): ) # Initialize Retriever - from haystack.nodes import ElasticsearchRetriever + from haystack.nodes import BM25Retriever - retriever = ElasticsearchRetriever(document_store=document_store) + retriever = BM25Retriever(document_store=document_store) # Alternative: Evaluate dense retrievers (EmbeddingRetriever or DensePassageRetriever) # The EmbeddingRetriever uses a single transformer based encoder model for query and document. diff --git a/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb b/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb index 9e7a8e481..acf25e346 100644 --- a/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb +++ b/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb @@ -270,7 +270,7 @@ "\n", "**Alternatives:**\n", "\n", - "- The `ElasticsearchRetriever`with custom queries (e.g. boosting) and filters\n", + "- The `BM25Retriever` with custom queries (e.g. boosting) and filters\n", "- Use `EmbeddingRetriever` to find candidate documents based on the similarity of embeddings (e.g. created via Sentence-BERT)\n", "- Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging" ]