fix: Add separate query method for OpenSearchDocumentStore (#4764)

* Add separate query method for OpenSearchDocumentStore

* Convert integration test to unit test + add separate tests for OpenSearch
This commit is contained in:
bogdankostic 2023-04-26 21:58:33 +02:00 committed by GitHub
parent 41b6e33f64
commit c7a20d68d2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 210 additions and 0 deletions

View File

@ -618,6 +618,182 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
query_embs, filters, top_k, index, return_embedding, headers, scale_score
)
def query(
self,
query: Optional[str],
filters: Optional[FilterType] = None,
top_k: int = 10,
custom_query: Optional[str] = None,
index: Optional[str] = None,
headers: Optional[Dict[str, str]] = None,
all_terms_must_match: bool = False,
scale_score: bool = True,
) -> List[Document]:
"""
Scan through documents in DocumentStore and return a small number documents
that are most relevant to the query as defined by the BM25 algorithm.
:param query: The query
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
# or simpler using default operators
filters = {
"type": "article",
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": ["economy", "politics"],
"publisher": "nytimes"
}
}
```
To use the same logical operator multiple times on the same level, logical operators take
optionally a list of dictionaries as value.
__Example__:
```python
filters = {
"$or": [
{
"$and": {
"Type": "News Paper",
"Date": {
"$lt": "2019-01-01"
}
}
},
{
"$and": {
"Type": "Blog Post",
"Date": {
"$gte": "2019-01-01"
}
}
}
]
}
```
:param top_k: How many documents to return per query.
:param custom_query: query string containing a mandatory `${query}` placeholder.
Optionally, ES `filter` clause can be added where the values of `terms` are placeholders
that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..)
names must match with the filters dict supplied in self.retrieve().
::
**An example custom_query:**
```python
{
"size": 10,
"query": {
"bool": {
"should": [{"multi_match": {
"query": ${query}, // mandatory query placeholder
"type": "most_fields",
"fields": ["content", "title"]}}],
"filter": [ // optional custom filters
{"terms": {"year": ${years}}},
{"terms": {"quarter": ${quarters}}},
{"range": {"date": {"gte": ${date}}}}
],
}
},
}
```
**For this custom_query, a sample retrieve() could be:**
```python
self.retrieve(query="Why did the revenue increase?",
filters={"years": ["2019"], "quarters": ["Q1", "Q2"]})
```
Optionally, highlighting can be defined by specifying the highlight settings.
See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html.
You will find the highlighted output in the returned Document's meta field by key "highlighted".
::
**Example custom_query with highlighting:**
```python
{
"size": 10,
"query": {
"bool": {
"should": [{"multi_match": {
"query": ${query}, // mandatory query placeholder
"type": "most_fields",
"fields": ["content", "title"]}}],
}
},
"highlight": { // enable highlighting
"fields": { // for fields content and title
"content": {},
"title": {}
}
},
}
```
**For this custom_query, highlighting info can be accessed by:**
```python
docs = self.retrieve(query="Why did the revenue increase?")
highlighted_content = docs[0].meta["highlighted"]["content"]
highlighted_title = docs[0].meta["highlighted"]["title"]
```
:param index: The name of the index in the DocumentStore from which to retrieve documents
:param headers: Custom HTTP headers to pass to the client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
:param all_terms_must_match: Whether all terms of the query must match the document.
If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
Defaults to false.
:param scale_score: Whether to scale the similarity score to the unit interval (range of [0,1]).
If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
"""
if index is None:
index = self.index
body = self._construct_query_body(
query=query,
filters=filters,
top_k=top_k,
custom_query=custom_query,
all_terms_must_match=all_terms_must_match,
)
result = self.client.search(index=index, body=body, headers=headers)["hits"]["hits"]
documents = [self._convert_es_hit_to_document(hit, scale_score=scale_score) for hit in result]
return documents
def _construct_dense_query_body(
self, query_emb: np.ndarray, return_embedding: bool, filters: Optional[FilterType] = None, top_k: int = 10
):

View File

@ -392,6 +392,40 @@ class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDoc
_, kwargs = mocked_open_search_init.call_args
assert "http_auth" not in kwargs
@pytest.mark.unit
def test_query(self, mocked_document_store):
mocked_document_store.query(query=self.query)
kwargs = mocked_document_store.client.search.call_args.kwargs
assert "index" in kwargs
assert "body" in kwargs
assert "headers" in kwargs
@pytest.mark.unit
def test_query_return_embedding_false(self, mocked_document_store):
mocked_document_store.return_embedding = False
mocked_document_store.query(self.query)
# assert the resulting body is consistent with the `excluded_meta_data` value
_, kwargs = mocked_document_store.client.search.call_args
assert kwargs["body"]["_source"] == {"excludes": ["embedding"]}
@pytest.mark.unit
def test_query_excluded_meta_data_return_embedding_true(self, mocked_document_store):
mocked_document_store.return_embedding = True
mocked_document_store.excluded_meta_data = ["foo", "embedding"]
mocked_document_store.query(self.query)
_, kwargs = mocked_document_store.client.search.call_args
# we expect "embedding" was removed from the final query
assert kwargs["body"]["_source"] == {"excludes": ["foo"]}
@pytest.mark.unit
def test_query_excluded_meta_data_return_embedding_false(self, mocked_document_store):
mocked_document_store.return_embedding = False
mocked_document_store.excluded_meta_data = ["foo"]
mocked_document_store.query(self.query)
# assert the resulting body is consistent with the `excluded_meta_data` value
_, kwargs = mocked_document_store.client.search.call_args
assert kwargs["body"]["_source"] == {"excludes": ["foo", "embedding"]}
@pytest.mark.unit
def test_query_by_embedding_raises_if_missing_field(self, mocked_document_store):
mocked_document_store.embedding_field = ""