mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-27 18:06:17 +00:00
fix: Add separate query method for OpenSearchDocumentStore (#4764)
* Add separate query method for OpenSearchDocumentStore * Convert integration test to unit test + add separate tests for OpenSearch
This commit is contained in:
parent
41b6e33f64
commit
c7a20d68d2
@ -618,6 +618,182 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
||||
query_embs, filters, top_k, index, return_embedding, headers, scale_score
|
||||
)
|
||||
|
||||
def query(
    self,
    query: Optional[str],
    filters: Optional[FilterType] = None,
    top_k: int = 10,
    custom_query: Optional[str] = None,
    index: Optional[str] = None,
    headers: Optional[Dict[str, str]] = None,
    all_terms_must_match: bool = False,
    scale_score: bool = True,
) -> List[Document]:
    """
    Scan through the documents in the DocumentStore and return a small number of documents
    that are most relevant to the query as defined by the BM25 algorithm.

    :param query: The query.
    :param filters: Optional filters to narrow down the search space to documents whose metadata
                    fulfill certain conditions.
                    Filters are defined as nested dictionaries. The keys of the dictionaries can be a
                    logical operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`,
                    `"$in"`, `"$gt"`, `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
                    Logical operator keys take a dictionary of metadata field names and/or logical
                    operators as value. Metadata field names take a dictionary of comparison operators
                    as value. Comparison operator keys take a single value or (in case of `"$in"`) a
                    list of values as value.
                    If no logical operator is provided, `"$and"` is used as the default operation. If
                    no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is
                    a list) is used as the default operation.

                    __Example__:

                    ```python
                    filters = {
                        "$and": {
                            "type": {"$eq": "article"},
                            "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
                            "rating": {"$gte": 3},
                            "$or": {
                                "genre": {"$in": ["economy", "politics"]},
                                "publisher": {"$eq": "nytimes"}
                            }
                        }
                    }
                    # or simpler using default operators
                    filters = {
                        "type": "article",
                        "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
                        "rating": {"$gte": 3},
                        "$or": {
                            "genre": ["economy", "politics"],
                            "publisher": "nytimes"
                        }
                    }
                    ```

                    To use the same logical operator multiple times on the same level, logical
                    operators optionally take a list of dictionaries as value.

                    __Example__:

                    ```python
                    filters = {
                        "$or": [
                            {
                                "$and": {
                                    "Type": "News Paper",
                                    "Date": {
                                        "$lt": "2019-01-01"
                                    }
                                }
                            },
                            {
                                "$and": {
                                    "Type": "Blog Post",
                                    "Date": {
                                        "$gte": "2019-01-01"
                                    }
                                }
                            }
                        ]
                    }
                    ```
    :param top_k: How many documents to return per query.
    :param custom_query: Query string containing a mandatory `${query}` placeholder.

                         Optionally, an ES `filter` clause can be added where the values of `terms`
                         are placeholders that get substituted during runtime. The placeholder names
                         (${filter_name_1}, ${filter_name_2}..) must match the keys of the filters
                         dict supplied in self.retrieve().

                         **An example custom_query:**

                         ```python
                         {
                             "size": 10,
                             "query": {
                                 "bool": {
                                     "should": [{"multi_match": {
                                         "query": ${query},                 // mandatory query placeholder
                                         "type": "most_fields",
                                         "fields": ["content", "title"]}}],
                                     "filter": [                                 // optional custom filters
                                         {"terms": {"year": ${years}}},
                                         {"terms": {"quarter": ${quarters}}},
                                         {"range": {"date": {"gte": ${date}}}}
                                     ],
                                 }
                             },
                         }
                         ```

                         **For this custom_query, a sample retrieve() could be:**

                         ```python
                         self.retrieve(query="Why did the revenue increase?",
                                       filters={"years": ["2019"], "quarters": ["Q1", "Q2"]})
                         ```

                         Optionally, highlighting can be defined by specifying the highlight settings.
                         See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html.
                         You will find the highlighted output in the returned Document's meta field
                         under the key "highlighted".

                         **Example custom_query with highlighting:**

                         ```python
                         {
                             "size": 10,
                             "query": {
                                 "bool": {
                                     "should": [{"multi_match": {
                                         "query": ${query},                 // mandatory query placeholder
                                         "type": "most_fields",
                                         "fields": ["content", "title"]}}],
                                 }
                             },
                             "highlight": {             // enable highlighting
                                 "fields": {            // for fields content and title
                                     "content": {},
                                     "title": {}
                                 }
                             },
                         }
                         ```

                         **For this custom_query, highlighting info can be accessed by:**

                         ```python
                         docs = self.retrieve(query="Why did the revenue increase?")
                         highlighted_content = docs[0].meta["highlighted"]["content"]
                         highlighted_title = docs[0].meta["highlighted"]["title"]
                         ```
    :param index: The name of the index in the DocumentStore from which to retrieve documents.
                  Falls back to `self.index` when not given.
    :param headers: Custom HTTP headers to pass to the client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}).
                    Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html
                    for more information.
    :param all_terms_must_match: Whether all terms of the query must match the document.
                                 If true, all query terms must be present in a document in order to be
                                 retrieved (i.e. the AND operator is used implicitly between query terms:
                                 "cozy fish restaurant" -> "cozy AND fish AND restaurant").
                                 Otherwise, at least one query term must be present in a document in
                                 order to be retrieved (i.e. the OR operator is used implicitly between
                                 query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
                                 Defaults to false.
    :param scale_score: Whether to scale the similarity score to the unit interval (range of [0,1]).
                        If true (default), similarity scores (e.g. cosine or dot_product) which naturally
                        have a different value range will be scaled to a range of [0,1], where 1 means
                        extremely relevant.
                        Otherwise, raw similarity scores (e.g. cosine or dot_product) will be used.
    :return: The matching hits converted to Documents, in the order returned by the search client.
    """
    # Fall back to the store's default index when the caller did not name one.
    target_index = self.index if index is None else index

    search_body = self._construct_query_body(
        query=query,
        filters=filters,
        top_k=top_k,
        custom_query=custom_query,
        all_terms_must_match=all_terms_must_match,
    )

    response = self.client.search(index=target_index, body=search_body, headers=headers)
    hits = response["hits"]["hits"]

    return [self._convert_es_hit_to_document(hit, scale_score=scale_score) for hit in hits]
|
||||
|
||||
def _construct_dense_query_body(
|
||||
self, query_emb: np.ndarray, return_embedding: bool, filters: Optional[FilterType] = None, top_k: int = 10
|
||||
):
|
||||
|
@ -392,6 +392,40 @@ class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDoc
|
||||
_, kwargs = mocked_open_search_init.call_args
|
||||
assert "http_auth" not in kwargs
|
||||
|
||||
@pytest.mark.unit
def test_query(self, mocked_document_store):
    """Check that `query()` passes `index`, `body` and `headers` as keyword arguments to `client.search`."""
    mocked_document_store.query(query=self.query)
    # `call_args` unpacks into (positional args, keyword args); only the keywords matter here.
    _, search_kwargs = mocked_document_store.client.search.call_args
    for expected_key in ("index", "body", "headers"):
        assert expected_key in search_kwargs
|
||||
|
||||
@pytest.mark.unit
def test_query_return_embedding_false(self, mocked_document_store):
    """With `return_embedding` disabled, the search body must exclude the "embedding" field from `_source`."""
    mocked_document_store.return_embedding = False
    mocked_document_store.query(self.query)

    expected_source = {"excludes": ["embedding"]}
    # Index 1 of `call_args` holds the keyword arguments of the recorded search call.
    search_kwargs = mocked_document_store.client.search.call_args[1]
    assert search_kwargs["body"]["_source"] == expected_source
|
||||
|
||||
@pytest.mark.unit
def test_query_excluded_meta_data_return_embedding_true(self, mocked_document_store):
    """With `return_embedding` enabled, "embedding" must be dropped from the excluded fields sent to the client."""
    mocked_document_store.return_embedding = True
    mocked_document_store.excluded_meta_data = ["foo", "embedding"]
    mocked_document_store.query(self.query)

    # "embedding" is expected to be removed from the final query, leaving only "foo" excluded.
    expected_source = {"excludes": ["foo"]}
    search_kwargs = mocked_document_store.client.search.call_args[1]
    assert search_kwargs["body"]["_source"] == expected_source
|
||||
|
||||
@pytest.mark.unit
def test_query_excluded_meta_data_return_embedding_false(self, mocked_document_store):
    """With `return_embedding` disabled, "embedding" must be appended to the configured `excluded_meta_data`."""
    mocked_document_store.return_embedding = False
    mocked_document_store.excluded_meta_data = ["foo"]
    mocked_document_store.query(self.query)

    # The resulting body must combine the `excluded_meta_data` value with the "embedding" field.
    expected_source = {"excludes": ["foo", "embedding"]}
    search_kwargs = mocked_document_store.client.search.call_args[1]
    assert search_kwargs["body"]["_source"] == expected_source
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_query_by_embedding_raises_if_missing_field(self, mocked_document_store):
|
||||
mocked_document_store.embedding_field = ""
|
||||
|
Loading…
x
Reference in New Issue
Block a user