mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-28 10:26:27 +00:00
fix: Add separate query method for OpenSearchDocumentStore (#4764)
* Add separate query method for OpenSearchDocumentStore * Convert integration test to unit test + add separate tests for OpenSearch
This commit is contained in:
parent
41b6e33f64
commit
c7a20d68d2
@ -618,6 +618,182 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
query_embs, filters, top_k, index, return_embedding, headers, scale_score
|
query_embs, filters, top_k, index, return_embedding, headers, scale_score
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def query(
    self,
    query: Optional[str],
    filters: Optional[FilterType] = None,
    top_k: int = 10,
    custom_query: Optional[str] = None,
    index: Optional[str] = None,
    headers: Optional[Dict[str, str]] = None,
    all_terms_must_match: bool = False,
    scale_score: bool = True,
) -> List[Document]:
    """
    Search the DocumentStore with a text query and return the few documents that the BM25 algorithm
    ranks as most relevant to it.

    :param query: The query text.
    :param filters: Optional metadata conditions that restrict the documents taken into account.
                    A filter is a (possibly nested) dictionary. Each key is either a logical operator
                    (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
                    `"$gte"`, `"$lt"`, `"$lte"`) or the name of a metadata field.

                    - A logical operator maps to a dictionary of metadata field names and/or further
                      logical operators.
                    - A metadata field name maps to a dictionary of comparison operators.
                    - A comparison operator maps to a single value, or to a list of values for `"$in"`.

                    When no logical operator is given, `"$and"` is applied. When no comparison operator
                    is given, `"$eq"` is applied (`"$in"` if the value to compare against is a list).

                        __Example__:

                        ```python
                        filters = {
                            "$and": {
                                "type": {"$eq": "article"},
                                "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
                                "rating": {"$gte": 3},
                                "$or": {
                                    "genre": {"$in": ["economy", "politics"]},
                                    "publisher": {"$eq": "nytimes"}
                                }
                            }
                        }
                        # or simpler using default operators
                        filters = {
                            "type": "article",
                            "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
                            "rating": {"$gte": 3},
                            "$or": {
                                "genre": ["economy", "politics"],
                                "publisher": "nytimes"
                            }
                        }
                        ```

                    A logical operator may also take a list of dictionaries as its value, which makes it
                    possible to repeat the same logical operator on one level.

                        __Example__:

                        ```python
                        filters = {
                            "$or": [
                                {
                                    "$and": {
                                        "Type": "News Paper",
                                        "Date": {
                                            "$lt": "2019-01-01"
                                        }
                                    }
                                },
                                {
                                    "$and": {
                                        "Type": "Blog Post",
                                        "Date": {
                                            "$gte": "2019-01-01"
                                        }
                                    }
                                }
                            ]
                        }
                        ```
    :param top_k: Number of documents to return per query.
    :param custom_query: A query string that must contain the `${query}` placeholder.

                         An ES `filter` clause may be included as well. The values of its `terms` are
                         placeholders (${filter_name_1}, ${filter_name_2}..) that are filled in at runtime;
                         their names have to match the keys of the filters dict passed to self.retrieve().
                         ::

                             **An example custom_query:**
                             ```python
                             {
                                 "size": 10,
                                 "query": {
                                     "bool": {
                                         "should": [{"multi_match": {
                                             "query": ${query},                 // mandatory query placeholder
                                             "type": "most_fields",
                                             "fields": ["content", "title"]}}],
                                         "filter": [                                 // optional custom filters
                                             {"terms": {"year": ${years}}},
                                             {"terms": {"quarter": ${quarters}}},
                                             {"range": {"date": {"gte": ${date}}}}
                                             ],
                                     }
                                 },
                             }
                             ```

                             **For this custom_query, a sample retrieve() could be:**
                             ```python
                             self.retrieve(query="Why did the revenue increase?",
                                           filters={"years": ["2019"], "quarters": ["Q1", "Q2"]})
                             ```

                         Highlighting can be switched on by adding highlight settings to the custom query.
                         See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html.
                         The highlighted output is placed in the meta field of each returned Document under
                         the key "highlighted".
                         ::

                             **Example custom_query with highlighting:**
                             ```python
                             {
                                 "size": 10,
                                 "query": {
                                     "bool": {
                                         "should": [{"multi_match": {
                                             "query": ${query},                 // mandatory query placeholder
                                             "type": "most_fields",
                                             "fields": ["content", "title"]}}],
                                     }
                                 },
                                 "highlight": {             // enable highlighting
                                     "fields": {            // for fields content and title
                                         "content": {},
                                         "title": {}
                                     }
                                 },
                             }
                             ```

                             **For this custom_query, highlighting info can be accessed by:**
                             ```python
                             docs = self.retrieve(query="Why did the revenue increase?")
                             highlighted_content = docs[0].meta["highlighted"]["content"]
                             highlighted_title = docs[0].meta["highlighted"]["title"]
                             ```

    :param index: Name of the index to search. When `None`, `self.index` is used.
    :param headers: Custom HTTP headers handed to the client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
            See https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
    :param all_terms_must_match: Whether a document has to contain every term of the query.
                                 If true, only documents containing all query terms are retrieved (the terms are implicitly joined with AND: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
                                 Otherwise a document is retrieved as soon as it contains at least one query term (the terms are implicitly joined with OR: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
                                 Defaults to false.
    :param scale_score: Whether to scale the similarity score to the unit interval (range of [0,1]).
                        If true (default), similarity scores (e.g. cosine or dot_product), which naturally have a different value range, are scaled to [0,1], where 1 means extremely relevant.
                        Otherwise the raw similarity scores (e.g. cosine or dot_product) are used.
    :return: One Document per hit in the search response.
    """
    # Use the store's configured index unless the caller named one explicitly.
    target_index = self.index if index is None else index

    search_body = self._construct_query_body(
        query=query,
        filters=filters,
        top_k=top_k,
        custom_query=custom_query,
        all_terms_must_match=all_terms_must_match,
    )

    response = self.client.search(index=target_index, body=search_body, headers=headers)

    # The matching records are nested under hits -> hits in the search response.
    return [self._convert_es_hit_to_document(hit, scale_score=scale_score) for hit in response["hits"]["hits"]]
|
||||||
|
|
||||||
def _construct_dense_query_body(
|
def _construct_dense_query_body(
|
||||||
self, query_emb: np.ndarray, return_embedding: bool, filters: Optional[FilterType] = None, top_k: int = 10
|
self, query_emb: np.ndarray, return_embedding: bool, filters: Optional[FilterType] = None, top_k: int = 10
|
||||||
):
|
):
|
||||||
|
@ -392,6 +392,40 @@ class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDoc
|
|||||||
_, kwargs = mocked_open_search_init.call_args
|
_, kwargs = mocked_open_search_init.call_args
|
||||||
assert "http_auth" not in kwargs
|
assert "http_auth" not in kwargs
|
||||||
|
|
||||||
|
@pytest.mark.unit
def test_query(self, mocked_document_store):
    """Check that `query()` calls the client's `search` with `index`, `body` and `headers` keyword arguments."""
    mocked_document_store.query(query=self.query)
    search_kwargs = mocked_document_store.client.search.call_args.kwargs
    for expected_key in ("index", "body", "headers"):
        assert expected_key in search_kwargs
|
||||||
|
|
||||||
|
@pytest.mark.unit
def test_query_return_embedding_false(self, mocked_document_store):
    """Check the `_source` excludes sent to the client when `return_embedding` is turned off."""
    mocked_document_store.return_embedding = False
    mocked_document_store.query(self.query)
    # Index 1 of `call_args` holds the keyword arguments of the last `search` call.
    search_kwargs = mocked_document_store.client.search.call_args[1]
    # With embeddings switched off, the request body has to exclude the embedding field.
    assert search_kwargs["body"]["_source"] == {"excludes": ["embedding"]}
|
||||||
|
|
||||||
|
@pytest.mark.unit
def test_query_excluded_meta_data_return_embedding_true(self, mocked_document_store):
    """Check that "embedding" is left out of the excludes when `return_embedding` is turned on."""
    mocked_document_store.return_embedding = True
    mocked_document_store.excluded_meta_data = ["foo", "embedding"]
    mocked_document_store.query(self.query)
    # Index 1 of `call_args` holds the keyword arguments of the last `search` call.
    search_kwargs = mocked_document_store.client.search.call_args[1]
    # "embedding" was listed in `excluded_meta_data` but must not appear in the final query.
    assert search_kwargs["body"]["_source"] == {"excludes": ["foo"]}
|
||||||
|
|
||||||
|
@pytest.mark.unit
def test_query_excluded_meta_data_return_embedding_false(self, mocked_document_store):
    """Check that "embedding" is added to the configured excludes when `return_embedding` is turned off."""
    mocked_document_store.return_embedding = False
    mocked_document_store.excluded_meta_data = ["foo"]
    mocked_document_store.query(self.query)
    # Index 1 of `call_args` holds the keyword arguments of the last `search` call.
    search_kwargs = mocked_document_store.client.search.call_args[1]
    # The request body must carry the `excluded_meta_data` entries followed by "embedding".
    assert search_kwargs["body"]["_source"] == {"excludes": ["foo", "embedding"]}
|
||||||
|
|
||||||
@pytest.mark.unit
|
@pytest.mark.unit
|
||||||
def test_query_by_embedding_raises_if_missing_field(self, mocked_document_store):
|
def test_query_by_embedding_raises_if_missing_field(self, mocked_document_store):
|
||||||
mocked_document_store.embedding_field = ""
|
mocked_document_store.embedding_field = ""
|
||||||
|
Loading…
x
Reference in New Issue
Block a user