mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-07 20:46:31 +00:00
Supported Highlighting in Elasticsearch (#1930)
* Supported Highlighting * Review changes * add example to docstrings * Add latest docstring and tutorial changes * Add latest docstring and tutorial changes Co-authored-by: sowmiya-emplay <sowmiya.j@emplay.net> Co-authored-by: Thomas Stadelmann <thomas.stadelmann@deepset.ai> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: tstadel <60758086+tstadel@users.noreply.github.com>
This commit is contained in:
parent
2edc421a09
commit
c4fff19018
@ -508,7 +508,72 @@ that are most relevant to the query as defined by the BM25 algorithm.
|
||||
- `query`: The query
|
||||
- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
|
||||
- `top_k`: How many documents to return per query.
|
||||
- `custom_query`: Custom elasticsearch query to be executed.
|
||||
- `custom_query`: Query string as per Elasticsearch DSL with a mandatory query placeholder (`${query}`).
|
||||
|
||||
Optionally, ES `filter` clause can be added where the values of `terms` are placeholders
|
||||
that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..)
|
||||
names must match with the filters dict supplied in self.retrieve().
|
||||
::
|
||||
|
||||
**An example custom_query:**
|
||||
```python
|
||||
| {
|
||||
| "size": 10,
|
||||
| "query": {
|
||||
| "bool": {
|
||||
| "should": [{"multi_match": {
|
||||
| "query": ${query}, // mandatory query placeholder
|
||||
| "type": "most_fields",
|
||||
| "fields": ["content", "title"]}}],
|
||||
| "filter": [ // optional custom filters
|
||||
| {"terms": {"year": ${years}}},
|
||||
| {"terms": {"quarter": ${quarters}}},
|
||||
| {"range": {"date": {"gte": ${date}}}}
|
||||
| ],
|
||||
| }
|
||||
| },
|
||||
| }
|
||||
```
|
||||
|
||||
**For this custom_query, a sample retrieve() could be:**
|
||||
```python
|
||||
| self.retrieve(query="Why did the revenue increase?",
|
||||
| filters={"years": ["2019"], "quarters": ["Q1", "Q2"]})
|
||||
```
|
||||
|
||||
Optionally, highlighting can be defined by specifying Elasticsearch's highlight settings.
|
||||
See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html.
|
||||
You will find the highlighted output in the returned Document's meta field by key "highlighted".
|
||||
::
|
||||
|
||||
**Example custom_query with highlighting:**
|
||||
```python
|
||||
| {
|
||||
| "size": 10,
|
||||
| "query": {
|
||||
| "bool": {
|
||||
| "should": [{"multi_match": {
|
||||
| "query": ${query}, // mandatory query placeholder
|
||||
| "type": "most_fields",
|
||||
| "fields": ["content", "title"]}}],
|
||||
| }
|
||||
| },
|
||||
| "highlight": { // enable highlighting
|
||||
| "fields": { // for fields content and title
|
||||
| "content": {},
|
||||
| "title": {}
|
||||
| }
|
||||
| },
|
||||
| }
|
||||
```
|
||||
|
||||
**For this custom_query, highlighting info can be accessed by:**
|
||||
```python
|
||||
| docs = self.retrieve(query="Why did the revenue increase?")
|
||||
| highlighted_content = docs[0].meta["highlighted"]["content"]
|
||||
| highlighted_title = docs[0].meta["highlighted"]["title"]
|
||||
```
|
||||
|
||||
- `index`: The name of the index in the DocumentStore from which to retrieve documents
|
||||
- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
|
||||
Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
|
||||
|
||||
@ -134,6 +134,40 @@ class ElasticsearchRetriever(BaseRetriever)
|
||||
| self.retrieve(query="Why did the revenue increase?",
|
||||
| filters={"years": ["2019"], "quarters": ["Q1", "Q2"]})
|
||||
```
|
||||
|
||||
Optionally, highlighting can be defined by specifying Elasticsearch's highlight settings.
|
||||
See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html.
|
||||
You will find the highlighted output in the returned Document's meta field by key "highlighted".
|
||||
::
|
||||
|
||||
**Example custom_query with highlighting:**
|
||||
```python
|
||||
| {
|
||||
| "size": 10,
|
||||
| "query": {
|
||||
| "bool": {
|
||||
| "should": [{"multi_match": {
|
||||
| "query": ${query}, // mandatory query placeholder
|
||||
| "type": "most_fields",
|
||||
| "fields": ["content", "title"]}}],
|
||||
| }
|
||||
| },
|
||||
| "highlight": { // enable highlighting
|
||||
| "fields": { // for fields content and title
|
||||
| "content": {},
|
||||
| "title": {}
|
||||
| }
|
||||
| },
|
||||
| }
|
||||
```
|
||||
|
||||
**For this custom_query, highlighting info can be accessed by:**
|
||||
```python
|
||||
| docs = self.retrieve(query="Why did the revenue increase?")
|
||||
| highlighted_content = docs[0].meta["highlighted"]["content"]
|
||||
| highlighted_title = docs[0].meta["highlighted"]["title"]
|
||||
```
|
||||
|
||||
- `top_k`: How many documents to return per query.
|
||||
|
||||
<a name="sparse.ElasticsearchRetriever.retrieve"></a>
|
||||
|
||||
@ -186,7 +186,7 @@ for question in QUESTIONS:
|
||||
|
||||
# Print your answer
|
||||
answers = predicted_result["answers"]
|
||||
print(f'Generated answer is \'{answers[0]["answer"]}\' for the question = \'{question}\'')
|
||||
print(f'Generated answer is \'{answers[0].answer}\' for the question = \'{question}\'')
|
||||
```
|
||||
|
||||
|
||||
|
||||
@ -745,7 +745,72 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
:param query: The query
|
||||
:param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
|
||||
:param top_k: How many documents to return per query.
|
||||
:param custom_query: Custom elasticsearch query to be executed.
|
||||
:param custom_query: Query string as per Elasticsearch DSL with a mandatory query placeholder (`${query}`).
|
||||
|
||||
Optionally, ES `filter` clause can be added where the values of `terms` are placeholders
|
||||
that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..)
|
||||
names must match with the filters dict supplied in self.retrieve().
|
||||
::
|
||||
|
||||
**An example custom_query:**
|
||||
```python
|
||||
| {
|
||||
| "size": 10,
|
||||
| "query": {
|
||||
| "bool": {
|
||||
| "should": [{"multi_match": {
|
||||
| "query": ${query}, // mandatory query placeholder
|
||||
| "type": "most_fields",
|
||||
| "fields": ["content", "title"]}}],
|
||||
| "filter": [ // optional custom filters
|
||||
| {"terms": {"year": ${years}}},
|
||||
| {"terms": {"quarter": ${quarters}}},
|
||||
| {"range": {"date": {"gte": ${date}}}}
|
||||
| ],
|
||||
| }
|
||||
| },
|
||||
| }
|
||||
```
|
||||
|
||||
**For this custom_query, a sample retrieve() could be:**
|
||||
```python
|
||||
| self.retrieve(query="Why did the revenue increase?",
|
||||
| filters={"years": ["2019"], "quarters": ["Q1", "Q2"]})
|
||||
```
|
||||
|
||||
Optionally, highlighting can be defined by specifying Elasticsearch's highlight settings.
|
||||
See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html.
|
||||
You will find the highlighted output in the returned Document's meta field by key "highlighted".
|
||||
::
|
||||
|
||||
**Example custom_query with highlighting:**
|
||||
```python
|
||||
| {
|
||||
| "size": 10,
|
||||
| "query": {
|
||||
| "bool": {
|
||||
| "should": [{"multi_match": {
|
||||
| "query": ${query}, // mandatory query placeholder
|
||||
| "type": "most_fields",
|
||||
| "fields": ["content", "title"]}}],
|
||||
| }
|
||||
| },
|
||||
| "highlight": { // enable highlighting
|
||||
| "fields": { // for fields content and title
|
||||
| "content": {},
|
||||
| "title": {}
|
||||
| }
|
||||
| },
|
||||
| }
|
||||
```
|
||||
|
||||
**For this custom_query, highlighting info can be accessed by:**
|
||||
```python
|
||||
| docs = self.retrieve(query="Why did the revenue increase?")
|
||||
| highlighted_content = docs[0].meta["highlighted"]["content"]
|
||||
| highlighted_title = docs[0].meta["highlighted"]["title"]
|
||||
```
|
||||
|
||||
:param index: The name of the index in the DocumentStore from which to retrieve documents
|
||||
:param headers: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
|
||||
Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
|
||||
@ -963,6 +1028,9 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
if name:
|
||||
meta_data["name"] = name
|
||||
|
||||
if 'highlight' in hit:
|
||||
meta_data['highlighted'] = hit['highlight']
|
||||
|
||||
score = hit["_score"]
|
||||
if score:
|
||||
if adapt_score_for_embedding:
|
||||
|
||||
@ -49,6 +49,40 @@ class ElasticsearchRetriever(BaseRetriever):
|
||||
| self.retrieve(query="Why did the revenue increase?",
|
||||
| filters={"years": ["2019"], "quarters": ["Q1", "Q2"]})
|
||||
```
|
||||
|
||||
Optionally, highlighting can be defined by specifying Elasticsearch's highlight settings.
|
||||
See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html.
|
||||
You will find the highlighted output in the returned Document's meta field by key "highlighted".
|
||||
::
|
||||
|
||||
**Example custom_query with highlighting:**
|
||||
```python
|
||||
| {
|
||||
| "size": 10,
|
||||
| "query": {
|
||||
| "bool": {
|
||||
| "should": [{"multi_match": {
|
||||
| "query": ${query}, // mandatory query placeholder
|
||||
| "type": "most_fields",
|
||||
| "fields": ["content", "title"]}}],
|
||||
| }
|
||||
| },
|
||||
| "highlight": { // enable highlighting
|
||||
| "fields": { // for fields content and title
|
||||
| "content": {},
|
||||
| "title": {}
|
||||
| }
|
||||
| },
|
||||
| }
|
||||
```
|
||||
|
||||
**For this custom_query, highlighting info can be accessed by:**
|
||||
```python
|
||||
| docs = self.retrieve(query="Why did the revenue increase?")
|
||||
| highlighted_content = docs[0].meta["highlighted"]["content"]
|
||||
| highlighted_title = docs[0].meta["highlighted"]["title"]
|
||||
```
|
||||
|
||||
:param top_k: How many documents to return per query.
|
||||
"""
|
||||
# save init parameters to enable export of component config as YAML
|
||||
|
||||
@ -351,3 +351,114 @@ def test_table_text_retriever_training(document_store):
|
||||
|
||||
# Load trained model
|
||||
retriever = TableTextRetriever.load(load_dir="test_table_text_retriever_train", document_store=document_store)
|
||||
|
||||
|
||||
@pytest.mark.elasticsearch
|
||||
def test_elasticsearch_highlight():
|
||||
client = Elasticsearch()
|
||||
client.indices.delete(index="haystack_hl_test", ignore=[404])
|
||||
|
||||
# Map both the content and title fields as "text" so that search can be performed on both fields.
|
||||
document_store = ElasticsearchDocumentStore(index="haystack_hl_test", content_field= "title",
|
||||
custom_mapping={
|
||||
"mappings": {
|
||||
"properties": {
|
||||
"content": {
|
||||
"type": "text"
|
||||
},
|
||||
"title": {
|
||||
"type": "text"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
)
|
||||
documents = [
|
||||
{"title": "Green tea components", "meta":{"content": "The green tea plant contains a range of healthy compounds that make it into the final drink"}, "id":"1"},
|
||||
{"title": "Green tea catechin", "meta":{"content": "Green tea contains a catechin called epigallocatechin-3-gallate (EGCG)."}, "id":"2"},
|
||||
{"title": "Minerals in Green tea", "meta":{"content": "Green tea also has small amounts of minerals that can benefit your health."}, "id":"3"},
|
||||
{"title": "Green tea Benefits", "meta":{"content": "Green tea does more than just keep you alert, it may also help boost brain function."}, "id":"4"}
|
||||
]
|
||||
document_store.write_documents(documents)
|
||||
|
||||
# Enable highlighting on the "title" and "content" fields only, using a custom query
|
||||
retriever_1 = ElasticsearchRetriever(document_store=document_store,
|
||||
custom_query=
|
||||
"""{
|
||||
"size": 20,
|
||||
"query": {
|
||||
"bool": {
|
||||
"should": [
|
||||
{
|
||||
"multi_match": {
|
||||
"query": ${query},
|
||||
"fields": [
|
||||
"content^3",
|
||||
"title^5"
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"highlight": {
|
||||
"pre_tags": [
|
||||
"**"
|
||||
],
|
||||
"post_tags": [
|
||||
"**"
|
||||
],
|
||||
"number_of_fragments": 3,
|
||||
"fragment_size": 5,
|
||||
"fields": {
|
||||
"content": {},
|
||||
"title": {}
|
||||
}
|
||||
}
|
||||
}""",
|
||||
)
|
||||
results = retriever_1.retrieve(query="is green tea healthy")
|
||||
|
||||
assert len(results[0].meta['highlighted']) == 2
|
||||
assert results[0].meta['highlighted']['title'] == ['**Green**', '**tea** components']
|
||||
assert results[0].meta['highlighted']['content'] == ['The **green**', '**tea** plant', 'range of **healthy**']
|
||||
|
||||
# Enable highlighting on the "title" field only, using a custom query
|
||||
retriever_2 = ElasticsearchRetriever(document_store=document_store,
|
||||
custom_query=
|
||||
"""{
|
||||
"size": 20,
|
||||
"query": {
|
||||
"bool": {
|
||||
"should": [
|
||||
{
|
||||
"multi_match": {
|
||||
"query": ${query},
|
||||
"fields": [
|
||||
"content^3",
|
||||
"title^5"
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"highlight": {
|
||||
"pre_tags": [
|
||||
"**"
|
||||
],
|
||||
"post_tags": [
|
||||
"**"
|
||||
],
|
||||
"number_of_fragments": 3,
|
||||
"fragment_size": 5,
|
||||
"fields": {
|
||||
"title": {}
|
||||
}
|
||||
}
|
||||
}""",
|
||||
)
|
||||
results = retriever_2.retrieve(query="is green tea healthy")
|
||||
|
||||
assert len(results[0].meta['highlighted']) == 1
|
||||
assert results[0].meta['highlighted']['title'] == ['**Green**', '**tea** components']
|
||||
Loading…
x
Reference in New Issue
Block a user