fix: Add separate query method for OpenSearchDocumentStore (#4764)

* Add separate query method for OpenSearchDocumentStore * Convert integration test to unit test + add separate tests for OpenSearch
2025-10-23 05:48:42 +00:00 · 2023-04-26 21:58:33 +02:00 · 2023-04-26 21:58:33 +02:00 · c7a20d68d2
commit c7a20d68d2
parent 41b6e33f64
2 changed files with 210 additions and 0 deletions
--- a/haystack/document_stores/opensearch.py
+++ b/haystack/document_stores/opensearch.py
@ -618,6 +618,182 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
            query_embs, filters, top_k, index, return_embedding, headers, scale_score
        )
    def query(
        self,
        query: Optional[str],
        filters: Optional[FilterType] = None,
        top_k: int = 10,
        custom_query: Optional[str] = None,
        index: Optional[str] = None,
        headers: Optional[Dict[str, str]] = None,
        all_terms_must_match: bool = False,
        scale_score: bool = True,
    ) -> List[Document]:
        """
        Scan through documents in DocumentStore and return a small number documents
        that are most relevant to the query as defined by the BM25 algorithm.
        :param query: The query
        :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
                        conditions.
                        Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
                        operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
                        `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
                        Logical operator keys take a dictionary of metadata field names and/or logical operators as
                        value. Metadata field names take a dictionary of comparison operators as value. Comparison
                        operator keys take a single value or (in case of `"$in"`) a list of values as value.
                        If no logical operator is provided, `"$and"` is used as default operation. If no comparison
                        operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
                        operation.
                            __Example__:
                            ```python
                            filters = {
                                "$and": {
                                    "type": {"$eq": "article"},
                                    "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
                                    "rating": {"$gte": 3},
                                    "$or": {
                                        "genre": {"$in": ["economy", "politics"]},
                                        "publisher": {"$eq": "nytimes"}
                                    }
                                }
                            }
                            # or simpler using default operators
                            filters = {
                                "type": "article",
                                "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
                                "rating": {"$gte": 3},
                                "$or": {
                                    "genre": ["economy", "politics"],
                                    "publisher": "nytimes"
                                }
                            }
                            ```
                            To use the same logical operator multiple times on the same level, logical operators take
                            optionally a list of dictionaries as value.
                            __Example__:
                            ```python
                            filters = {
                                "$or": [
                                    {
                                        "$and": {
                                            "Type": "News Paper",
                                            "Date": {
                                                "$lt": "2019-01-01"
                                            }
                                        }
                                    },
                                    {
                                        "$and": {
                                            "Type": "Blog Post",
                                            "Date": {
                                                "$gte": "2019-01-01"
                                            }
                                        }
                                    }
                                ]
                            }
                            ```
        :param top_k: How many documents to return per query.
        :param custom_query: query string containing a mandatory `${query}` placeholder.
                             Optionally, ES `filter` clause can be added where the values of `terms` are placeholders
                             that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..)
                             names must match with the filters dict supplied in self.retrieve().
                             ::
                                 **An example custom_query:**
                                ```python
                                {
                                    "size": 10,
                                    "query": {
                                        "bool": {
                                            "should": [{"multi_match": {
                                                "query": ${query},                 // mandatory query placeholder
                                                "type": "most_fields",
                                                "fields": ["content", "title"]}}],
                                            "filter": [                                 // optional custom filters
                                                {"terms": {"year": ${years}}},
                                                {"terms": {"quarter": ${quarters}}},
                                                {"range": {"date": {"gte": ${date}}}}
                                                ],
                                        }
                                    },
                                }
                                 ```
                                **For this custom_query, a sample retrieve() could be:**
                                ```python
                                self.retrieve(query="Why did the revenue increase?",
                                              filters={"years": ["2019"], "quarters": ["Q1", "Q2"]})
                                ```
                             Optionally, highlighting can be defined by specifying the highlight settings.
                             See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html.
                             You will find the highlighted output in the returned Document's meta field by key "highlighted".
                             ::
                                 **Example custom_query with highlighting:**
                                ```python
                                {
                                    "size": 10,
                                    "query": {
                                        "bool": {
                                            "should": [{"multi_match": {
                                                "query": ${query},                 // mandatory query placeholder
                                                "type": "most_fields",
                                                "fields": ["content", "title"]}}],
                                        }
                                    },
                                    "highlight": {             // enable highlighting
                                        "fields": {            // for fields content and title
                                            "content": {},
                                            "title": {}
                                        }
                                    },
                                }
                                 ```
                                 **For this custom_query, highlighting info can be accessed by:**
                                ```python
                                docs = self.retrieve(query="Why did the revenue increase?")
                                highlighted_content = docs[0].meta["highlighted"]["content"]
                                highlighted_title = docs[0].meta["highlighted"]["title"]
                                ```
        :param index: The name of the index in the DocumentStore from which to retrieve documents
        :param headers: Custom HTTP headers to pass to the client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
                Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
        :param all_terms_must_match: Whether all terms of the query must match the document.
                                     If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
                                     Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
                                     Defaults to false.
        :param scale_score: Whether to scale the similarity score to the unit interval (range of [0,1]).
                            If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
                            Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
        """
        if index is None:
            index = self.index
        body = self._construct_query_body(
            query=query,
            filters=filters,
            top_k=top_k,
            custom_query=custom_query,
            all_terms_must_match=all_terms_must_match,
        )
        result = self.client.search(index=index, body=body, headers=headers)["hits"]["hits"]
        documents = [self._convert_es_hit_to_document(hit, scale_score=scale_score) for hit in result]
        return documents
    def _construct_dense_query_body(
        self, query_emb: np.ndarray, return_embedding: bool, filters: Optional[FilterType] = None, top_k: int = 10
    ):
--- a/test/document_stores/test_opensearch.py
+++ b/test/document_stores/test_opensearch.py
@ -392,6 +392,40 @@ class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDoc
        _, kwargs = mocked_open_search_init.call_args
        assert "http_auth" not in kwargs
    @pytest.mark.unit
    def test_query(self, mocked_document_store):
        mocked_document_store.query(query=self.query)
        kwargs = mocked_document_store.client.search.call_args.kwargs
        assert "index" in kwargs
        assert "body" in kwargs
        assert "headers" in kwargs
    @pytest.mark.unit
    def test_query_return_embedding_false(self, mocked_document_store):
        mocked_document_store.return_embedding = False
        mocked_document_store.query(self.query)
        # assert the resulting body is consistent with the `excluded_meta_data` value
        _, kwargs = mocked_document_store.client.search.call_args
        assert kwargs["body"]["_source"] == {"excludes": ["embedding"]}
    @pytest.mark.unit
    def test_query_excluded_meta_data_return_embedding_true(self, mocked_document_store):
        mocked_document_store.return_embedding = True
        mocked_document_store.excluded_meta_data = ["foo", "embedding"]
        mocked_document_store.query(self.query)
        _, kwargs = mocked_document_store.client.search.call_args
        # we expect "embedding" was removed from the final query
        assert kwargs["body"]["_source"] == {"excludes": ["foo"]}
    @pytest.mark.unit
    def test_query_excluded_meta_data_return_embedding_false(self, mocked_document_store):
        mocked_document_store.return_embedding = False
        mocked_document_store.excluded_meta_data = ["foo"]
        mocked_document_store.query(self.query)
        # assert the resulting body is consistent with the `excluded_meta_data` value
        _, kwargs = mocked_document_store.client.search.call_args
        assert kwargs["body"]["_source"] == {"excludes": ["foo", "embedding"]}
    @pytest.mark.unit
    def test_query_by_embedding_raises_if_missing_field(self, mocked_document_store):
        mocked_document_store.embedding_field = ""