diff --git a/haystack/document_stores/opensearch.py b/haystack/document_stores/opensearch.py index 38e2fb9de..0d0d9dd4d 100644 --- a/haystack/document_stores/opensearch.py +++ b/haystack/document_stores/opensearch.py @@ -618,6 +618,182 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore): query_embs, filters, top_k, index, return_embedding, headers, scale_score ) + def query( + self, + query: Optional[str], + filters: Optional[FilterType] = None, + top_k: int = 10, + custom_query: Optional[str] = None, + index: Optional[str] = None, + headers: Optional[Dict[str, str]] = None, + all_terms_must_match: bool = False, + scale_score: bool = True, + ) -> List[Document]: + """ + Scan through documents in DocumentStore and return a small number of documents + that are most relevant to the query as defined by the BM25 algorithm. + + :param query: The query + :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain + conditions. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. 
+ + __Example__: + + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators optionally take + a list of dictionaries as value. + + __Example__: + + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` + :param top_k: How many documents to return per query. + :param custom_query: query string containing a mandatory `${query}` placeholder. + + Optionally, ES `filter` clause can be added where the values of `terms` are placeholders + that get substituted during runtime. The placeholder (${filter_name_1}, ${filter_name_2}, ...) + names must match the keys of the filters dict supplied in self.retrieve(). + :: + + **An example custom_query:** + ```python + { + "size": 10, + "query": { + "bool": { + "should": [{"multi_match": { + "query": ${query}, // mandatory query placeholder + "type": "most_fields", + "fields": ["content", "title"]}}], + "filter": [ // optional custom filters + {"terms": {"year": ${years}}}, + {"terms": {"quarter": ${quarters}}}, + {"range": {"date": {"gte": ${date}}}} + ], + } + }, + } + ``` + + **For this custom_query, a sample retrieve() could be:** + ```python + self.retrieve(query="Why did the revenue increase?", + filters={"years": ["2019"], "quarters": ["Q1", "Q2"]}) + ``` + + Optionally, highlighting can be defined by specifying the highlight settings. 
+ See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html. + You will find the highlighted output in the returned Document's meta field by key "highlighted". + :: + + **Example custom_query with highlighting:** + ```python + { + "size": 10, + "query": { + "bool": { + "should": [{"multi_match": { + "query": ${query}, // mandatory query placeholder + "type": "most_fields", + "fields": ["content", "title"]}}], + } + }, + "highlight": { // enable highlighting + "fields": { // for fields content and title + "content": {}, + "title": {} + } + }, + } + ``` + + **For this custom_query, highlighting info can be accessed by:** + ```python + docs = self.retrieve(query="Why did the revenue increase?") + highlighted_content = docs[0].meta["highlighted"]["content"] + highlighted_title = docs[0].meta["highlighted"]["title"] + ``` + + :param index: The name of the index in the DocumentStore from which to retrieve documents + :param headers: Custom HTTP headers to pass to the client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) + Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + :param all_terms_must_match: Whether all terms of the query must match the document. + If true, all query terms must be present in a document in order to be retrieved (i.e. the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant"). + Otherwise, at least one query term must be present in a document in order to be retrieved (i.e. the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant"). + Defaults to false. + :param scale_score: Whether to scale the similarity score to the unit interval (range of [0,1]). + If true (default), similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. 
+ Otherwise, raw similarity scores (e.g. cosine or dot_product) will be used. + """ + + if index is None: + index = self.index + + body = self._construct_query_body( + query=query, + filters=filters, + top_k=top_k, + custom_query=custom_query, + all_terms_must_match=all_terms_must_match, + ) + + result = self.client.search(index=index, body=body, headers=headers)["hits"]["hits"] + + documents = [self._convert_es_hit_to_document(hit, scale_score=scale_score) for hit in result] + return documents + def _construct_dense_query_body( self, query_emb: np.ndarray, return_embedding: bool, filters: Optional[FilterType] = None, top_k: int = 10 ): diff --git a/test/document_stores/test_opensearch.py b/test/document_stores/test_opensearch.py index df070bc84..47bc06c31 100644 --- a/test/document_stores/test_opensearch.py +++ b/test/document_stores/test_opensearch.py @@ -392,6 +392,40 @@ class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDoc _, kwargs = mocked_open_search_init.call_args assert "http_auth" not in kwargs + @pytest.mark.unit + def test_query(self, mocked_document_store): + mocked_document_store.query(query=self.query) + kwargs = mocked_document_store.client.search.call_args.kwargs + assert "index" in kwargs + assert "body" in kwargs + assert "headers" in kwargs + + @pytest.mark.unit + def test_query_return_embedding_false(self, mocked_document_store): + mocked_document_store.return_embedding = False + mocked_document_store.query(self.query) + # assert the resulting body is consistent with the `return_embedding` value + _, kwargs = mocked_document_store.client.search.call_args + assert kwargs["body"]["_source"] == {"excludes": ["embedding"]} + + @pytest.mark.unit + def test_query_excluded_meta_data_return_embedding_true(self, mocked_document_store): + mocked_document_store.return_embedding = True + mocked_document_store.excluded_meta_data = ["foo", "embedding"] + mocked_document_store.query(self.query) + _, kwargs = 
mocked_document_store.client.search.call_args + # we expect "embedding" was removed from the final query + assert kwargs["body"]["_source"] == {"excludes": ["foo"]} + + @pytest.mark.unit + def test_query_excluded_meta_data_return_embedding_false(self, mocked_document_store): + mocked_document_store.return_embedding = False + mocked_document_store.excluded_meta_data = ["foo"] + mocked_document_store.query(self.query) + # assert the resulting body is consistent with the `excluded_meta_data` value + _, kwargs = mocked_document_store.client.search.call_args + assert kwargs["body"]["_source"] == {"excludes": ["foo", "embedding"]} + @pytest.mark.unit def test_query_by_embedding_raises_if_missing_field(self, mocked_document_store): mocked_document_store.embedding_field = ""