bug: removed the duplicated addition of the meta "name" field to the content before embedding in the update_embeddings workflow (#3368)

* Removed explicit passage formatting by name field

* Passing the correct input type for embedding the docs

* Updated test, updated similarity scores and added results

* Changed the expected input to the embed method
This commit is contained in:
Mayank Jobanputra 2022-10-25 11:22:05 +02:00 committed by GitHub
parent 1b9586ae40
commit d48577b4e7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 11 additions and 4 deletions

View File

@ -190,8 +190,8 @@ class _SentenceTransformersEmbeddingEncoder(_BaseEmbeddingEncoder):
f"This can be set when initializing the DocumentStore"
)
def embed(self, texts: Union[List[List[str]], List[str], str]) -> np.ndarray:
# texts can be a list of strings or a list of [title, text]
def embed(self, texts: Union[List[str], str]) -> np.ndarray:
# texts can be a list of strings
# get back list of numpy embedding vectors
emb = self.embedding_model.encode(
texts, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True
@ -214,7 +214,7 @@ class _SentenceTransformersEmbeddingEncoder(_BaseEmbeddingEncoder):
:param docs: List of documents to embed.
:return: Embeddings, one per input document, shape: (documents, embedding_dim)
"""
passages = [[d.meta["name"] if d.meta and "name" in d.meta else "", d.content] for d in docs]
passages = [d.content for d in docs]
return self.embed(passages)
def train(

View File

@ -1465,8 +1465,15 @@ def test_similarity_score_sentence_transformers(document_store_with_docs):
pipeline = DocumentSearchPipeline(retriever)
prediction = pipeline.run("Paul lives in New York")
scores = [document.score for document in prediction["documents"]]
assert [document.content for document in prediction["documents"]] == [
"My name is Paul and I live in New York",
"My name is Matteo and I live in Rome",
"My name is Christelle and I live in Paris",
"My name is Carla and I live in Berlin",
"My name is Camila and I live in Madrid",
]
assert scores == pytest.approx(
[0.8497486114501953, 0.6622999012470245, 0.6077829301357269, 0.5928314849734306, 0.5614184625446796], abs=1e-3
[0.9149981737136841, 0.6895168423652649, 0.641706794500351, 0.6206043660640717, 0.5837393924593925], abs=1e-3
)