bug: remove the duplicated addition of the meta "name" field to the content before embedding in the update_embeddings workflow (#3368)

* Removed explicit passage formatting by the name field
* Passed the correct input type for embedding the docs
* Updated the test: updated similarity scores and added the expected result contents
* Changed the expected input type of the embed method
2025-12-30 00:30:09 +00:00 · 2022-10-25 11:22:05 +02:00 · 2022-10-25 11:22:05 +02:00 · d48577b4e7
commit d48577b4e7
parent 1b9586ae40
2 changed files with 11 additions and 4 deletions
--- a/haystack/nodes/retriever/_embedding_encoder.py
+++ b/haystack/nodes/retriever/_embedding_encoder.py
@ -190,8 +190,8 @@ class _SentenceTransformersEmbeddingEncoder(_BaseEmbeddingEncoder):
                f"This can be set when initializing the DocumentStore"
            )

-    def embed(self, texts: Union[List[List[str]], List[str], str]) -> np.ndarray:
-        # texts can be a list of strings or a list of [title, text]
+    def embed(self, texts: Union[List[str], str]) -> np.ndarray:
+        # texts can be a list of strings
        # get back list of numpy embedding vectors
        emb = self.embedding_model.encode(
            texts, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True
@ -214,7 +214,7 @@ class _SentenceTransformersEmbeddingEncoder(_BaseEmbeddingEncoder):
        :param docs: List of documents to embed.
        :return: Embeddings, one per input document, shape: (documents, embedding_dim)
        """
-        passages = [[d.meta["name"] if d.meta and "name" in d.meta else "", d.content] for d in docs]
+        passages = [d.content for d in docs]
        return self.embed(passages)

    def train(
--- a/test/document_stores/test_document_store.py
+++ b/test/document_stores/test_document_store.py
@ -1465,8 +1465,15 @@ def test_similarity_score_sentence_transformers(document_store_with_docs):
    pipeline = DocumentSearchPipeline(retriever)
    prediction = pipeline.run("Paul lives in New York")
    scores = [document.score for document in prediction["documents"]]
+    assert [document.content for document in prediction["documents"]] == [
+        "My name is Paul and I live in New York",
+        "My name is Matteo and I live in Rome",
+        "My name is Christelle and I live in Paris",
+        "My name is Carla and I live in Berlin",
+        "My name is Camila and I live in Madrid",
+    ]
    assert scores == pytest.approx(
-        [0.8497486114501953, 0.6622999012470245, 0.6077829301357269, 0.5928314849734306, 0.5614184625446796], abs=1e-3
+        [0.9149981737136841, 0.6895168423652649, 0.641706794500351, 0.6206043660640717, 0.5837393924593925], abs=1e-3
    )