From d48577b4e7cbcdf5fbc5a4cf76e5539190318efd Mon Sep 17 00:00:00 2001 From: Mayank Jobanputra Date: Tue, 25 Oct 2022 11:22:05 +0200 Subject: [PATCH] bug: removed duplicated meta "name" field addition to content before embedding in `update_embeddings` workflow (#3368) * Removed explicit passage formatting by name field * passing correct input type for embedding the docs * Updated test, updated similarity scores and added results * changed expected input to embed method --- haystack/nodes/retriever/_embedding_encoder.py | 6 +++--- test/document_stores/test_document_store.py | 9 ++++++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/haystack/nodes/retriever/_embedding_encoder.py b/haystack/nodes/retriever/_embedding_encoder.py index 95078debf..a0612d6f4 100644 --- a/haystack/nodes/retriever/_embedding_encoder.py +++ b/haystack/nodes/retriever/_embedding_encoder.py @@ -190,8 +190,8 @@ class _SentenceTransformersEmbeddingEncoder(_BaseEmbeddingEncoder): f"This can be set when initializing the DocumentStore" ) - def embed(self, texts: Union[List[List[str]], List[str], str]) -> np.ndarray: - # texts can be a list of strings or a list of [title, text] + def embed(self, texts: Union[List[str], str]) -> np.ndarray: + # texts can be a list of strings # get back list of numpy embedding vectors emb = self.embedding_model.encode( texts, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True @@ -214,7 +214,7 @@ class _SentenceTransformersEmbeddingEncoder(_BaseEmbeddingEncoder): :param docs: List of documents to embed. :return: Embeddings, one per input document, shape: (documents, embedding_dim) """ - passages = [[d.meta["name"] if d.meta and "name" in d.meta else "", d.content] for d in docs] + passages = [d.content for d in docs] return self.embed(passages) def train( diff --git a/test/document_stores/test_document_store.py b/test/document_stores/test_document_store.py index 8734f9dcd..7232fb619 100644 --- a/test/document_stores/test_document_store.py +++ b/test/document_stores/test_document_store.py @@ -1465,8 +1465,15 @@ def test_similarity_score_sentence_transformers(document_store_with_docs): pipeline = DocumentSearchPipeline(retriever) prediction = pipeline.run("Paul lives in New York") scores = [document.score for document in prediction["documents"]] + assert [document.content for document in prediction["documents"]] == [ + "My name is Paul and I live in New York", + "My name is Matteo and I live in Rome", + "My name is Christelle and I live in Paris", + "My name is Carla and I live in Berlin", + "My name is Camila and I live in Madrid", + ] assert scores == pytest.approx( - [0.8497486114501953, 0.6622999012470245, 0.6077829301357269, 0.5928314849734306, 0.5614184625446796], abs=1e-3 + [0.9149981737136841, 0.6895168423652649, 0.641706794500351, 0.6206043660640717, 0.5837393924593925], abs=1e-3 )