fix: Add separate query method for OpenSearchDocumentStore (#4764)

* Add separate query method for OpenSearchDocumentStore * Convert integration test to unit test + add separate tests for OpenSearch
2025-12-12 07:17:41 +00:00 · 2023-04-26 21:58:33 +02:00 · 2023-04-26 21:58:33 +02:00 · c7a20d68d2
commit c7a20d68d2
parent 41b6e33f64
2 changed files with 210 additions and 0 deletions
--- a/haystack/document_stores/opensearch.py
+++ b/haystack/document_stores/opensearch.py
@ -618,6 +618,182 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
            query_embs, filters, top_k, index, return_embedding, headers, scale_score
        )

+    def query(
+        self,
+        query: Optional[str],
+        filters: Optional[FilterType] = None,
+        top_k: int = 10,
+        custom_query: Optional[str] = None,
+        index: Optional[str] = None,
+        headers: Optional[Dict[str, str]] = None,
+        all_terms_must_match: bool = False,
+        scale_score: bool = True,
+    ) -> List[Document]:
+        """
+        Scan through documents in DocumentStore and return a small number documents
+        that are most relevant to the query as defined by the BM25 algorithm.
+
+        :param query: The query
+        :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+                        conditions.
+                        Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+                        operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+                        `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+                        Logical operator keys take a dictionary of metadata field names and/or logical operators as
+                        value. Metadata field names take a dictionary of comparison operators as value. Comparison
+                        operator keys take a single value or (in case of `"$in"`) a list of values as value.
+                        If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+                        operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+                        operation.
+
+                            __Example__:
+
+                            ```python
+                            filters = {
+                                "$and": {
+                                    "type": {"$eq": "article"},
+                                    "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+                                    "rating": {"$gte": 3},
+                                    "$or": {
+                                        "genre": {"$in": ["economy", "politics"]},
+                                        "publisher": {"$eq": "nytimes"}
+                                    }
+                                }
+                            }
+                            # or simpler using default operators
+                            filters = {
+                                "type": "article",
+                                "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+                                "rating": {"$gte": 3},
+                                "$or": {
+                                    "genre": ["economy", "politics"],
+                                    "publisher": "nytimes"
+                                }
+                            }
+                            ```
+
+                            To use the same logical operator multiple times on the same level, logical operators take
+                            optionally a list of dictionaries as value.
+
+                            __Example__:
+
+                            ```python
+                            filters = {
+                                "$or": [
+                                    {
+                                        "$and": {
+                                            "Type": "News Paper",
+                                            "Date": {
+                                                "$lt": "2019-01-01"
+                                            }
+                                        }
+                                    },
+                                    {
+                                        "$and": {
+                                            "Type": "Blog Post",
+                                            "Date": {
+                                                "$gte": "2019-01-01"
+                                            }
+                                        }
+                                    }
+                                ]
+                            }
+                            ```
+        :param top_k: How many documents to return per query.
+        :param custom_query: query string containing a mandatory `${query}` placeholder.
+
+                             Optionally, ES `filter` clause can be added where the values of `terms` are placeholders
+                             that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..)
+                             names must match with the filters dict supplied in self.retrieve().
+                             ::
+
+                                 **An example custom_query:**
+                                ```python
+                                {
+                                    "size": 10,
+                                    "query": {
+                                        "bool": {
+                                            "should": [{"multi_match": {
+                                                "query": ${query},                 // mandatory query placeholder
+                                                "type": "most_fields",
+                                                "fields": ["content", "title"]}}],
+                                            "filter": [                                 // optional custom filters
+                                                {"terms": {"year": ${years}}},
+                                                {"terms": {"quarter": ${quarters}}},
+                                                {"range": {"date": {"gte": ${date}}}}
+                                                ],
+                                        }
+                                    },
+                                }
+                                 ```
+
+                                **For this custom_query, a sample retrieve() could be:**
+                                ```python
+                                self.retrieve(query="Why did the revenue increase?",
+                                              filters={"years": ["2019"], "quarters": ["Q1", "Q2"]})
+                                ```
+
+                             Optionally, highlighting can be defined by specifying the highlight settings.
+                             See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html.
+                             You will find the highlighted output in the returned Document's meta field by key "highlighted".
+                             ::
+
+                                 **Example custom_query with highlighting:**
+                                ```python
+                                {
+                                    "size": 10,
+                                    "query": {
+                                        "bool": {
+                                            "should": [{"multi_match": {
+                                                "query": ${query},                 // mandatory query placeholder
+                                                "type": "most_fields",
+                                                "fields": ["content", "title"]}}],
+                                        }
+                                    },
+                                    "highlight": {             // enable highlighting
+                                        "fields": {            // for fields content and title
+                                            "content": {},
+                                            "title": {}
+                                        }
+                                    },
+                                }
+                                 ```
+
+                                 **For this custom_query, highlighting info can be accessed by:**
+                                ```python
+                                docs = self.retrieve(query="Why did the revenue increase?")
+                                highlighted_content = docs[0].meta["highlighted"]["content"]
+                                highlighted_title = docs[0].meta["highlighted"]["title"]
+                                ```
+
+        :param index: The name of the index in the DocumentStore from which to retrieve documents
+        :param headers: Custom HTTP headers to pass to the client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+                Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
+        :param all_terms_must_match: Whether all terms of the query must match the document.
+                                     If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
+                                     Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
+                                     Defaults to false.
+        :param scale_score: Whether to scale the similarity score to the unit interval (range of [0,1]).
+                            If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+                            Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+        """
+
+        if index is None:
+            index = self.index
+
+        body = self._construct_query_body(
+            query=query,
+            filters=filters,
+            top_k=top_k,
+            custom_query=custom_query,
+            all_terms_must_match=all_terms_must_match,
+        )
+
+        result = self.client.search(index=index, body=body, headers=headers)["hits"]["hits"]
+
+        documents = [self._convert_es_hit_to_document(hit, scale_score=scale_score) for hit in result]
+        return documents
+
    def _construct_dense_query_body(
        self, query_emb: np.ndarray, return_embedding: bool, filters: Optional[FilterType] = None, top_k: int = 10
    ):
--- a/test/document_stores/test_opensearch.py
+++ b/test/document_stores/test_opensearch.py
@ -392,6 +392,40 @@ class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDoc
        _, kwargs = mocked_open_search_init.call_args
        assert "http_auth" not in kwargs

+    @pytest.mark.unit
+    def test_query(self, mocked_document_store):
+        mocked_document_store.query(query=self.query)
+        kwargs = mocked_document_store.client.search.call_args.kwargs
+        assert "index" in kwargs
+        assert "body" in kwargs
+        assert "headers" in kwargs
+
+    @pytest.mark.unit
+    def test_query_return_embedding_false(self, mocked_document_store):
+        mocked_document_store.return_embedding = False
+        mocked_document_store.query(self.query)
+        # assert the resulting body is consistent with the `excluded_meta_data` value
+        _, kwargs = mocked_document_store.client.search.call_args
+        assert kwargs["body"]["_source"] == {"excludes": ["embedding"]}
+
+    @pytest.mark.unit
+    def test_query_excluded_meta_data_return_embedding_true(self, mocked_document_store):
+        mocked_document_store.return_embedding = True
+        mocked_document_store.excluded_meta_data = ["foo", "embedding"]
+        mocked_document_store.query(self.query)
+        _, kwargs = mocked_document_store.client.search.call_args
+        # we expect "embedding" was removed from the final query
+        assert kwargs["body"]["_source"] == {"excludes": ["foo"]}
+
+    @pytest.mark.unit
+    def test_query_excluded_meta_data_return_embedding_false(self, mocked_document_store):
+        mocked_document_store.return_embedding = False
+        mocked_document_store.excluded_meta_data = ["foo"]
+        mocked_document_store.query(self.query)
+        # assert the resulting body is consistent with the `excluded_meta_data` value
+        _, kwargs = mocked_document_store.client.search.call_args
+        assert kwargs["body"]["_source"] == {"excludes": ["foo", "embedding"]}
+
    @pytest.mark.unit
    def test_query_by_embedding_raises_if_missing_field(self, mocked_document_store):
        mocked_document_store.embedding_field = ""