diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index ef1e03781..dc0eb21c2 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -508,7 +508,72 @@ that are most relevant to the query as defined by the BM25 algorithm. - `query`: The query - `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field - `top_k`: How many documents to return per query. -- `custom_query`: Custom elasticsearch query to be executed. +- `custom_query`: query string as per Elasticsearch DSL with a mandatory query placeholder(query). + + Optionally, ES `filter` clause can be added where the values of `terms` are placeholders + that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..) + names must match with the filters dict supplied in self.retrieve(). + :: + + **An example custom_query:** + ```python + | { + | "size": 10, + | "query": { + | "bool": { + | "should": [{"multi_match": { + | "query": ${query}, // mandatory query placeholder + | "type": "most_fields", + | "fields": ["content", "title"]}}], + | "filter": [ // optional custom filters + | {"terms": {"year": ${years}}}, + | {"terms": {"quarter": ${quarters}}}, + | {"range": {"date": {"gte": ${date}}}} + | ], + | } + | }, + | } + ``` + + **For this custom_query, a sample retrieve() could be:** + ```python + | self.retrieve(query="Why did the revenue increase?", + | filters={"years": ["2019"], "quarters": ["Q1", "Q2"]}) + ``` + + Optionally, highlighting can be defined by specifying Elasticsearch's highlight settings. + See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html. + You will find the highlighted output in the returned Document's meta field by key "highlighted". 
+ :: + + **Example custom_query with highlighting:** + ```python + | { + | "size": 10, + | "query": { + | "bool": { + | "should": [{"multi_match": { + | "query": ${query}, // mandatory query placeholder + | "type": "most_fields", + | "fields": ["content", "title"]}}], + | } + | }, + | "highlight": { // enable highlighting + | "fields": { // for fields content and title + | "content": {}, + | "title": {} + | } + | }, + | } + ``` + + **For this custom_query, highlighting info can be accessed by:** + ```python + | docs = self.retrieve(query="Why did the revenue increase?") + | highlighted_content = docs[0].meta["highlighted"]["content"] + | highlighted_title = docs[0].meta["highlighted"]["title"] + ``` + - `index`: The name of the index in the DocumentStore from which to retrieve documents - `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md index e2c95f3f3..8b44d4eb2 100644 --- a/docs/_src/api/api/retriever.md +++ b/docs/_src/api/api/retriever.md @@ -134,6 +134,40 @@ class ElasticsearchRetriever(BaseRetriever) | self.retrieve(query="Why did the revenue increase?", | filters={"years": ["2019"], "quarters": ["Q1", "Q2"]}) ``` + + Optionally, highlighting can be defined by specifying Elasticsearch's highlight settings. + See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html. + You will find the highlighted output in the returned Document's meta field by key "highlighted". 
+ :: + + **Example custom_query with highlighting:** + ```python + | { + | "size": 10, + | "query": { + | "bool": { + | "should": [{"multi_match": { + | "query": ${query}, // mandatory query placeholder + | "type": "most_fields", + | "fields": ["content", "title"]}}], + | } + | }, + | "highlight": { // enable highlighting + | "fields": { // for fields content and title + | "content": {}, + | "title": {} + | } + | }, + | } + ``` + + **For this custom_query, highlighting info can be accessed by:** + ```python + | docs = self.retrieve(query="Why did the revenue increase?") + | highlighted_content = docs[0].meta["highlighted"]["content"] + | highlighted_title = docs[0].meta["highlighted"]["title"] + ``` + - `top_k`: How many documents to return per query. diff --git a/docs/_src/tutorials/tutorials/7.md b/docs/_src/tutorials/tutorials/7.md index e1e9ad12c..758aed024 100644 --- a/docs/_src/tutorials/tutorials/7.md +++ b/docs/_src/tutorials/tutorials/7.md @@ -186,7 +186,7 @@ for question in QUESTIONS: # Print you answer answers = predicted_result["answers"] - print(f'Generated answer is \'{answers[0]["answer"]}\' for the question = \'{question}\'') + print(f'Generated answer is \'{answers[0].answer}\' for the question = \'{question}\'') ``` diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py index 04ec2b4fd..3fa7351a7 100644 --- a/haystack/document_stores/elasticsearch.py +++ b/haystack/document_stores/elasticsearch.py @@ -745,7 +745,72 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): :param query: The query :param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field :param top_k: How many documents to return per query. - :param custom_query: Custom elasticsearch query to be executed. + :param custom_query: query string as per Elasticsearch DSL with a mandatory query placeholder(query). 
+ + Optionally, ES `filter` clause can be added where the values of `terms` are placeholders + that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..) + names must match with the filters dict supplied in self.retrieve(). + :: + + **An example custom_query:** + ```python + | { + | "size": 10, + | "query": { + | "bool": { + | "should": [{"multi_match": { + | "query": ${query}, // mandatory query placeholder + | "type": "most_fields", + | "fields": ["content", "title"]}}], + | "filter": [ // optional custom filters + | {"terms": {"year": ${years}}}, + | {"terms": {"quarter": ${quarters}}}, + | {"range": {"date": {"gte": ${date}}}} + | ], + | } + | }, + | } + ``` + + **For this custom_query, a sample retrieve() could be:** + ```python + | self.retrieve(query="Why did the revenue increase?", + | filters={"years": ["2019"], "quarters": ["Q1", "Q2"]}) + ``` + + Optionally, highlighting can be defined by specifying Elasticsearch's highlight settings. + See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html. + You will find the highlighted output in the returned Document's meta field by key "highlighted". 
+ :: + + **Example custom_query with highlighting:** + ```python + | { + | "size": 10, + | "query": { + | "bool": { + | "should": [{"multi_match": { + | "query": ${query}, // mandatory query placeholder + | "type": "most_fields", + | "fields": ["content", "title"]}}], + | } + | }, + | "highlight": { // enable highlighting + | "fields": { // for fields content and title + | "content": {}, + | "title": {} + | } + | }, + | } + ``` + + **For this custom_query, highlighting info can be accessed by:** + ```python + | docs = self.retrieve(query="Why did the revenue increase?") + | highlighted_content = docs[0].meta["highlighted"]["content"] + | highlighted_title = docs[0].meta["highlighted"]["title"] + ``` + :param index: The name of the index in the DocumentStore from which to retrieve documents :param headers: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. @@ -963,6 +1028,9 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): if name: meta_data["name"] = name + if 'highlight' in hit: + meta_data['highlighted'] = hit['highlight'] + score = hit["_score"] if score: if adapt_score_for_embedding: diff --git a/haystack/nodes/retriever/sparse.py b/haystack/nodes/retriever/sparse.py index ed5cf5540..e98595121 100644 --- a/haystack/nodes/retriever/sparse.py +++ b/haystack/nodes/retriever/sparse.py @@ -49,6 +49,40 @@ class ElasticsearchRetriever(BaseRetriever): | self.retrieve(query="Why did the revenue increase?", | filters={"years": ["2019"], "quarters": ["Q1", "Q2"]}) ``` + + Optionally, highlighting can be defined by specifying Elasticsearch's highlight settings. + See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html. + You will find the highlighted output in the returned Document's meta field by key "highlighted". 
+ :: + + **Example custom_query with highlighting:** + ```python + | { + | "size": 10, + | "query": { + | "bool": { + | "should": [{"multi_match": { + | "query": ${query}, // mandatory query placeholder + | "type": "most_fields", + | "fields": ["content", "title"]}}], + | } + | }, + | "highlight": { // enable highlighting + | "fields": { // for fields content and title + | "content": {}, + | "title": {} + | } + | }, + | } + ``` + + **For this custom_query, highlighting info can be accessed by:** + ```python + | docs = self.retrieve(query="Why did the revenue increase?") + | highlighted_content = docs[0].meta["highlighted"]["content"] + | highlighted_title = docs[0].meta["highlighted"]["title"] + ``` + :param top_k: How many documents to return per query. """ # save init parameters to enable export of component config as YAML diff --git a/test/test_retriever.py b/test/test_retriever.py index 8b479e65d..057e6e3c8 100644 --- a/test/test_retriever.py +++ b/test/test_retriever.py @@ -351,3 +351,114 @@ def test_table_text_retriever_training(document_store): # Load trained model retriever = TableTextRetriever.load(load_dir="test_table_text_retriever_train", document_store=document_store) + + +@pytest.mark.elasticsearch +def test_elasticsearch_highlight(): + client = Elasticsearch() + client.indices.delete(index="haystack_hl_test", ignore=[404]) + + # Map both the "content" and "title" fields as "text" so that the search can be performed on both of them. 
+ document_store = ElasticsearchDocumentStore(index="haystack_hl_test", content_field= "title", + custom_mapping={ + "mappings": { + "properties": { + "content": { + "type": "text" + }, + "title": { + "type": "text" + } + } + } + } + ) + documents = [ + {"title": "Green tea components", "meta":{"content": "The green tea plant contains a range of healthy compounds that make it into the final drink"}, "id":"1"}, + {"title": "Green tea catechin", "meta":{"content": "Green tea contains a catechin called epigallocatechin-3-gallate (EGCG)."}, "id":"2"}, + {"title": "Minerals in Green tea", "meta":{"content": "Green tea also has small amounts of minerals that can benefit your health."}, "id":"3"}, + {"title": "Green tea Benefits", "meta":{"content": "Green tea does more than just keep you alert, it may also help boost brain function."}, "id":"4"} + ] + document_store.write_documents(documents) + + # Enable highlighting on both the "title" and "content" fields using a custom query + retriever_1 = ElasticsearchRetriever(document_store=document_store, + custom_query= + """{ + "size": 20, + "query": { + "bool": { + "should": [ + { + "multi_match": { + "query": ${query}, + "fields": [ + "content^3", + "title^5" + ] + } + } + ] + } + }, + "highlight": { + "pre_tags": [ + "**" + ], + "post_tags": [ + "**" + ], + "number_of_fragments": 3, + "fragment_size": 5, + "fields": { + "content": {}, + "title": {} + } + } + }""", + ) + results = retriever_1.retrieve(query="is green tea healthy") + + assert len(results[0].meta['highlighted']) == 2 + assert results[0].meta['highlighted']['title'] == ['**Green**', '**tea** components'] + assert results[0].meta['highlighted']['content'] == ['The **green**', '**tea** plant', 'range of **healthy**'] + + # Enable highlighting on the "title" field only using a custom query + retriever_2 = ElasticsearchRetriever(document_store=document_store, + custom_query= + """{ + "size": 20, + "query": { + "bool": { + "should": [ + { + "multi_match": { + "query": ${query}, 
"fields": [ + "content^3", + "title^5" + ] + } + } + ] + } + }, + "highlight": { + "pre_tags": [ + "**" + ], + "post_tags": [ + "**" + ], + "number_of_fragments": 3, + "fragment_size": 5, + "fields": { + "title": {} + } + } + }""", + ) + results = retriever_2.retrieve(query="is green tea healthy") + + assert len(results[0].meta['highlighted']) == 1 + assert results[0].meta['highlighted']['title'] == ['**Green**', '**tea** components'] \ No newline at end of file