mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-29 07:59:27 +00:00
bug: remove the duplicate addition of the meta "name" field to the content before embedding in the update_embeddings workflow (#3368)
* Removed explicit passage formatting by name field
* Passed the correct input type for embedding the docs
* Updated test, updated similarity scores and added results
* Changed expected input to the embed method
This commit is contained in:
parent
1b9586ae40
commit
d48577b4e7
@ -190,8 +190,8 @@ class _SentenceTransformersEmbeddingEncoder(_BaseEmbeddingEncoder):
|
||||
f"This can be set when initializing the DocumentStore"
|
||||
)
|
||||
|
||||
def embed(self, texts: Union[List[List[str]], List[str], str]) -> np.ndarray:
|
||||
# texts can be a list of strings or a list of [title, text]
|
||||
def embed(self, texts: Union[List[str], str]) -> np.ndarray:
|
||||
# texts can be a list of strings
|
||||
# get back list of numpy embedding vectors
|
||||
emb = self.embedding_model.encode(
|
||||
texts, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True
|
||||
@ -214,7 +214,7 @@ class _SentenceTransformersEmbeddingEncoder(_BaseEmbeddingEncoder):
|
||||
:param docs: List of documents to embed.
|
||||
:return: Embeddings, one per input document, shape: (documents, embedding_dim)
|
||||
"""
|
||||
passages = [[d.meta["name"] if d.meta and "name" in d.meta else "", d.content] for d in docs]
|
||||
passages = [d.content for d in docs]
|
||||
return self.embed(passages)
|
||||
|
||||
def train(
|
||||
|
||||
@ -1465,8 +1465,15 @@ def test_similarity_score_sentence_transformers(document_store_with_docs):
|
||||
pipeline = DocumentSearchPipeline(retriever)
|
||||
prediction = pipeline.run("Paul lives in New York")
|
||||
scores = [document.score for document in prediction["documents"]]
|
||||
assert [document.content for document in prediction["documents"]] == [
|
||||
"My name is Paul and I live in New York",
|
||||
"My name is Matteo and I live in Rome",
|
||||
"My name is Christelle and I live in Paris",
|
||||
"My name is Carla and I live in Berlin",
|
||||
"My name is Camila and I live in Madrid",
|
||||
]
|
||||
assert scores == pytest.approx(
|
||||
[0.8497486114501953, 0.6622999012470245, 0.6077829301357269, 0.5928314849734306, 0.5614184625446796], abs=1e-3
|
||||
[0.9149981737136841, 0.6895168423652649, 0.641706794500351, 0.6206043660640717, 0.5837393924593925], abs=1e-3
|
||||
)
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user