enhancement: reduced usage of numpy and substituted built-in libraries (#8418)

* reduced usage of numpy and substituted built-in libraries

* added release note

* edited expit function to support both a float and a list (the list case was causing an error in CI)

* revert code, numpy can't be removed here

* more cleaning

* fix relnote

---------

Co-authored-by: anakin87 <stefanofiorucci@gmail.com>
This commit is contained in:
Ajit Singh 2024-10-18 19:12:19 +05:30 committed by GitHub
parent ff584f1577
commit 6cf13e8b98
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 31 additions and 25 deletions

View File

@ -2,9 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict, List, Optional, cast
import numpy as np
from typing import Any, Dict, List, Optional
from haystack.lazy_imports import LazyImport
from haystack.utils.auth import Secret
@ -78,5 +76,5 @@ class _SentenceTransformersEmbeddingBackend:
)
def embed(self, data: List[str], **kwargs) -> List[List[float]]:
embeddings = cast(np.ndarray, self.model.encode(data, **kwargs)).tolist()
embeddings = self.model.encode(data, **kwargs).tolist()
return embeddings

View File

@ -5,8 +5,6 @@
import os
import random
import numpy as np
from haystack import logging
logger = logging.getLogger(__name__)
@ -23,7 +21,6 @@ def set_all_seeds(seed: int, deterministic_cudnn: bool = False) -> None:
:param deterministic_cudnn: Enable for full reproducibility when using CUDA. Caution: might slow down training.
"""
random.seed(seed)
np.random.seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)
try:

View File

@ -2,9 +2,13 @@
#
# SPDX-License-Identifier: Apache-2.0
import numpy as np
from numpy import exp
def expit(x: float) -> float:
"""Compute logistic sigmoid function. Maps input values to a range between 0 and 1"""
return 1 / (1 + np.exp(-x))
def expit(x) -> float:
"""
Compute logistic sigmoid function. Maps input values to a range between 0 and 1
:param x: input value. Can be a scalar or a numpy array.
"""
return 1 / (1 + exp(-x))

View File

@ -0,0 +1,4 @@
---
enhancements:
- |
Reduced numpy usage to speed up imports.

View File

@ -4,9 +4,9 @@
import os
from unittest.mock import MagicMock, patch
import random
import pytest
from huggingface_hub.utils import RepositoryNotFoundError
from numpy import array, random
from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
from haystack.dataclasses import Document
@ -24,7 +24,7 @@ def mock_check_valid_model():
def mock_embedding_generation(json, **kwargs):
response = str(array([random.rand(384) for i in range(len(json["inputs"]))]).tolist()).encode()
response = str([[random.random() for _ in range(384)] for _ in range(len(json["inputs"]))]).encode()
return response

View File

@ -4,9 +4,9 @@
import os
from unittest.mock import MagicMock, patch
import random
import pytest
from huggingface_hub.utils import RepositoryNotFoundError
from numpy import array, random
from haystack.components.embedders import HuggingFaceAPITextEmbedder
from haystack.utils.auth import Secret
@ -22,7 +22,7 @@ def mock_check_valid_model():
def mock_embedding_generation(json, **kwargs):
response = str(array([random.rand(384) for i in range(len(json["inputs"]))]).tolist()).encode()
response = str([[random.random() for _ in range(384)] for _ in range(len(json["inputs"]))]).encode()
return response

View File

@ -5,7 +5,7 @@ import os
from typing import List
from haystack.utils.auth import Secret
import numpy as np
import random
import pytest
from haystack import Document
@ -16,7 +16,8 @@ def mock_openai_response(input: List[str], model: str = "text-embedding-ada-002"
dict_response = {
"object": "list",
"data": [
{"object": "embedding", "index": i, "embedding": np.random.rand(1536).tolist()} for i in range(len(input))
{"object": "embedding", "index": i, "embedding": [random.random() for _ in range(1536)]}
for i in range(len(input))
],
"model": model,
"usage": {"prompt_tokens": 4, "total_tokens": 4},

View File

@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0
from unittest.mock import MagicMock, patch
import numpy as np
import random
import pytest
import torch
@ -264,7 +264,9 @@ class TestSentenceTransformersDocumentEmbedder:
def test_run(self):
embedder = SentenceTransformersDocumentEmbedder(model="model")
embedder.embedding_backend = MagicMock()
embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist()
embedder.embedding_backend.embed = lambda x, **kwargs: [
[random.random() for _ in range(16)] for _ in range(len(x))
]
documents = [Document(content=f"document number {i}") for i in range(5)]

View File

@ -4,7 +4,7 @@
from unittest.mock import MagicMock, patch
import torch
import numpy as np
import random
import pytest
from haystack.components.embedders.sentence_transformers_text_embedder import SentenceTransformersTextEmbedder
@ -239,7 +239,9 @@ class TestSentenceTransformersTextEmbedder:
def test_run(self):
embedder = SentenceTransformersTextEmbedder(model="model")
embedder.embedding_backend = MagicMock()
embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist()
embedder.embedding_backend.embed = lambda x, **kwargs: [
[random.random() for _ in range(16)] for _ in range(len(x))
]
text = "a nice text to embed"

View File

@ -5,7 +5,6 @@ import os
import math
from typing import List
import numpy as np
import pytest
from haystack import Pipeline

View File

@ -4,7 +4,6 @@
from typing import Dict, Any
import pytest
import numpy as np
from haystack import Pipeline, DeserializationError
from haystack.document_stores.types import FilterPolicy
@ -135,7 +134,7 @@ class TestMemoryEmbeddingRetriever:
assert "documents" in result
assert len(result["documents"]) == top_k
assert np.array_equal(result["documents"][0].embedding, [1.0, 1.0, 1.0, 1.0])
assert result["documents"][0].embedding == [1.0, 1.0, 1.0, 1.0]
def test_invalid_run_wrong_store_type(self):
SomeOtherDocumentStore = document_store_class("SomeOtherDocumentStore")
@ -165,4 +164,4 @@ class TestMemoryEmbeddingRetriever:
results_docs = result["retriever"]["documents"]
assert results_docs
assert len(results_docs) == top_k
assert np.array_equal(results_docs[0].embedding, [1.0, 1.0, 1.0, 1.0])
assert results_docs[0].embedding == [1.0, 1.0, 1.0, 1.0]