From 3e0ef1cc8a6e6cef44056fd18f4e94089fc90311 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 3 Jan 2022 17:14:51 +0100 Subject: [PATCH] Fix Numba TypingError in `normalize_embedding` for cosine similarity (#1933) * Fix Numba TypingError * Add latest docstring and tutorial changes Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/_src/api/api/document_store.md | 4 +++- haystack/document_stores/base.py | 15 ++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index 9d0009c0b..8464bc47b 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -130,11 +130,13 @@ object, provided that they have the same product_id (to be found in Label.meta[" #### normalize\_embedding ```python + | @staticmethod | @njit | normalize_embedding(emb: np.ndarray) -> None ``` -Performs L2 normalization of embeddings vector inplace. Input can be a single vector (1D array) or a matrix (2D array). +Performs L2 normalization of embeddings vector inplace. Input can be a single vector (1D array) or a matrix +(2D array). #### add\_eval\_data diff --git a/haystack/document_stores/base.py b/haystack/document_stores/base.py index 9526abe5a..93add2520 100644 --- a/haystack/document_stores/base.py +++ b/haystack/document_stores/base.py @@ -232,22 +232,27 @@ class BaseDocumentStore(BaseComponent): headers: Optional[Dict[str, str]] = None) -> int: pass + @staticmethod @njit#(fastmath=True) - def normalize_embedding(self, emb: np.ndarray) -> None: + def normalize_embedding(emb: np.ndarray) -> None: """ - Performs L2 normalization of embeddings vector inplace. Input can be a single vector (1D array) or a matrix (2D array). + Performs L2 normalization of embeddings vector inplace. Input can be a single vector (1D array) or a matrix + (2D array). """ # Might be extended to other normalizations in future # Single vec if len(emb.shape) == 1: - norm = np.sqrt(emb.dot(emb)) #faster than np.linalg.norm() + norm = np.sqrt(emb.dot(emb)) # faster than np.linalg.norm() if norm != 0.0: emb /= norm # 2D matrix else: - norm = np.linalg.norm(emb, axis=1) - emb /= norm[:, None] + for vec in emb: + vec = np.ascontiguousarray(vec) + norm = np.sqrt(vec.dot(vec)) + if norm != 0.0: + vec /= norm def finalize_raw_score(self, raw_score: float, similarity: Optional[str]) -> float: if similarity == "cosine":