enhancement: reduced usage of numpy and substituted built-in libraries (#8418)

* reduced usage of numpy and substituted built-in libraries

* added release note

* edited expit function to support both float and list inputs (this case was causing a CI error)

* reverted code, numpy can't be removed here

* more cleaning

* fix relnote

---------

Co-authored-by: anakin87 <stefanofiorucci@gmail.com>
Ajit Singh 2024-10-18 19:12:19 +05:30 committed by GitHub
parent ff584f1577
commit 6cf13e8b98
11 changed files with 31 additions and 25 deletions

View File

@@ -2,9 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
-from typing import Any, Dict, List, Optional, cast
+from typing import Any, Dict, List, Optional
-import numpy as np
 from haystack.lazy_imports import LazyImport
 from haystack.utils.auth import Secret
@@ -78,5 +76,5 @@ class _SentenceTransformersEmbeddingBackend:
         )
     def embed(self, data: List[str], **kwargs) -> List[List[float]]:
-        embeddings = cast(np.ndarray, self.model.encode(data, **kwargs)).tolist()
+        embeddings = self.model.encode(data, **kwargs).tolist()
         return embeddings
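Context for the cast removal above: by default SentenceTransformer.encode returns a numpy array when given a list of strings, so .tolist() already yields List[List[float]] without an explicit cast. A minimal sketch, assuming the default convert_to_numpy=True behaviour and an illustrative model name:

# Minimal sketch (model name is illustrative, not necessarily the one used by the backend).
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.encode(["first doc", "second doc"]).tolist()  # ndarray -> list of lists
assert isinstance(embeddings, list) and isinstance(embeddings[0][0], float)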

View File

@@ -5,8 +5,6 @@
 import os
 import random
-import numpy as np
 from haystack import logging
 logger = logging.getLogger(__name__)
@@ -23,7 +21,6 @@ def set_all_seeds(seed: int, deterministic_cudnn: bool = False) -> None:
     :param deterministic_cudnn: Enable for full reproducibility when using CUDA. Caution: might slow down training.
     """
     random.seed(seed)
-    np.random.seed(seed)
     os.environ["PYTHONHASHSEED"] = str(seed)
     try:
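One behavioural note on the hunk above: set_all_seeds no longer seeds numpy's global RNG, only Python's random module and PYTHONHASHSEED (the try block that follows is unchanged). A minimal sketch of the split, assuming a caller that still relies on numpy randomness:

# Sketch only: what the helper still covers vs. what a numpy-using caller must now do itself.
import random

import numpy as np  # assumption: the calling code itself still uses numpy

random.seed(42)      # covered by set_all_seeds (alongside PYTHONHASHSEED)
np.random.seed(42)   # no longer done by set_all_seeds; seed explicitly if numpy determinism is needed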

View File

@@ -2,9 +2,13 @@
 #
 # SPDX-License-Identifier: Apache-2.0
-import numpy as np
+from numpy import exp
-def expit(x: float) -> float:
-    """Compute logistic sigmoid function. Maps input values to a range between 0 and 1"""
-    return 1 / (1 + np.exp(-x))
+def expit(x) -> float:
+    """
+    Compute logistic sigmoid function. Maps input values to a range between 0 and 1
+
+    :param x: input value. Can be a scalar or a numpy array.
+    """
+    return 1 / (1 + exp(-x))
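The relaxed signature above is what fixed the CI failure mentioned in the commit message: numpy.exp broadcasts, so the same expression works for a plain Python float and for a numpy array. A small sketch, using a standalone copy of the function for illustration:

# Standalone copy for illustration; mirrors the new implementation above.
from numpy import array, exp

def expit(x) -> float:
    """Compute logistic sigmoid function. Maps input values to a range between 0 and 1"""
    return 1 / (1 + exp(-x))

print(expit(0.0))                      # 0.5
print(expit(array([-2.0, 0.0, 2.0])))  # element-wise sigmoid over the array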

View File

@@ -0,0 +1,4 @@
+---
+enhancements:
+  - |
+    Reduced numpy usage to speed up imports.

View File

@@ -4,9 +4,9 @@
 import os
 from unittest.mock import MagicMock, patch
+import random
 import pytest
 from huggingface_hub.utils import RepositoryNotFoundError
-from numpy import array, random
 from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
 from haystack.dataclasses import Document
@@ -24,7 +24,7 @@ def mock_check_valid_model():
 def mock_embedding_generation(json, **kwargs):
-    response = str(array([random.rand(384) for i in range(len(json["inputs"]))]).tolist()).encode()
+    response = str([[random.random() for _ in range(384)] for _ in range(len(json["inputs"]))]).encode()
     return response
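The mock rewrite above keeps the payload shape identical: one 384-dimensional vector per input, serialized with str() and encoded to bytes, just without importing numpy. A quick shape check under that assumption:

# Sketch: verify the stdlib-random mock payload has the same shape as the old numpy one.
import ast
import random

inputs = ["text one", "text two"]
response = str([[random.random() for _ in range(384)] for _ in range(len(inputs))]).encode()

decoded = ast.literal_eval(response.decode())  # parse the stringified list back
assert len(decoded) == len(inputs)
assert all(len(vector) == 384 for vector in decoded)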

View File

@@ -4,9 +4,9 @@
 import os
 from unittest.mock import MagicMock, patch
+import random
 import pytest
 from huggingface_hub.utils import RepositoryNotFoundError
-from numpy import array, random
 from haystack.components.embedders import HuggingFaceAPITextEmbedder
 from haystack.utils.auth import Secret
@@ -22,7 +22,7 @@ def mock_check_valid_model():
 def mock_embedding_generation(json, **kwargs):
-    response = str(array([random.rand(384) for i in range(len(json["inputs"]))]).tolist()).encode()
+    response = str([[random.random() for _ in range(384)] for _ in range(len(json["inputs"]))]).encode()
     return response

View File

@@ -5,7 +5,7 @@ import os
 from typing import List
 from haystack.utils.auth import Secret
-import numpy as np
+import random
 import pytest
 from haystack import Document
@@ -16,7 +16,8 @@ def mock_openai_response(input: List[str], model: str = "text-embedding-ada-002"
     dict_response = {
         "object": "list",
         "data": [
-            {"object": "embedding", "index": i, "embedding": np.random.rand(1536).tolist()} for i in range(len(input))
+            {"object": "embedding", "index": i, "embedding": [random.random() for _ in range(1536)]}
+            for i in range(len(input))
         ],
         "model": model,
         "usage": {"prompt_tokens": 4, "total_tokens": 4},

View File

@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 from unittest.mock import MagicMock, patch
-import numpy as np
+import random
 import pytest
 import torch
@@ -264,7 +264,9 @@ class TestSentenceTransformersDocumentEmbedder:
     def test_run(self):
         embedder = SentenceTransformersDocumentEmbedder(model="model")
         embedder.embedding_backend = MagicMock()
-        embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist()
+        embedder.embedding_backend.embed = lambda x, **kwargs: [
+            [random.random() for _ in range(16)] for _ in range(len(x))
+        ]
         documents = [Document(content=f"document number {i}") for i in range(5)]
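A sketch of the stubbed backend above in isolation: the lambda returns one 16-dimensional list per input, so the embedder still receives the List[List[float]] it expects, with no numpy involved:

# Sketch: the MagicMock-backed embed stub yields one 16-dim vector per input document.
import random
from unittest.mock import MagicMock

backend = MagicMock()
backend.embed = lambda x, **kwargs: [[random.random() for _ in range(16)] for _ in range(len(x))]

vectors = backend.embed(["doc 0", "doc 1", "doc 2"])
assert len(vectors) == 3
assert all(len(v) == 16 for v in vectors)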

View File

@@ -4,7 +4,7 @@
 from unittest.mock import MagicMock, patch
 import torch
-import numpy as np
+import random
 import pytest
 from haystack.components.embedders.sentence_transformers_text_embedder import SentenceTransformersTextEmbedder
@@ -239,7 +239,9 @@ class TestSentenceTransformersTextEmbedder:
     def test_run(self):
         embedder = SentenceTransformersTextEmbedder(model="model")
         embedder.embedding_backend = MagicMock()
-        embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist()
+        embedder.embedding_backend.embed = lambda x, **kwargs: [
+            [random.random() for _ in range(16)] for _ in range(len(x))
+        ]
         text = "a nice text to embed"

View File

@@ -5,7 +5,6 @@ import os
 import math
 from typing import List
-import numpy as np
 import pytest
 from haystack import Pipeline

View File

@@ -4,7 +4,6 @@
 from typing import Dict, Any
 import pytest
-import numpy as np
 from haystack import Pipeline, DeserializationError
 from haystack.document_stores.types import FilterPolicy
@@ -135,7 +134,7 @@ class TestMemoryEmbeddingRetriever:
         assert "documents" in result
         assert len(result["documents"]) == top_k
-        assert np.array_equal(result["documents"][0].embedding, [1.0, 1.0, 1.0, 1.0])
+        assert result["documents"][0].embedding == [1.0, 1.0, 1.0, 1.0]
     def test_invalid_run_wrong_store_type(self):
         SomeOtherDocumentStore = document_store_class("SomeOtherDocumentStore")
@@ -165,4 +164,4 @@ class TestMemoryEmbeddingRetriever:
         results_docs = result["retriever"]["documents"]
         assert results_docs
         assert len(results_docs) == top_k
-        assert np.array_equal(results_docs[0].embedding, [1.0, 1.0, 1.0, 1.0])
+        assert results_docs[0].embedding == [1.0, 1.0, 1.0, 1.0]
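The assertion change above works because the embedding in these tests is a plain Python list, and list equality is already an element-wise comparison:

# Sketch: for plain list embeddings, == gives the same verdict as np.array_equal did.
import numpy as np

embedding = [1.0, 1.0, 1.0, 1.0]
assert embedding == [1.0, 1.0, 1.0, 1.0]
assert np.array_equal(embedding, [1.0, 1.0, 1.0, 1.0])  # the old check, same result here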