mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-26 23:15:59 +00:00
enhancement: reduced usage of numpy and substituted built-in libraries (#8418)
* reduced usage of numpy and substituted built-in libraries
* added release note
* edited expit function to support both float and list inputs (this case was causing an error in CI)
* reverted code; numpy can't be removed here
* more cleaning
* fix relnote

---------

Co-authored-by: anakin87 <stefanofiorucci@gmail.com>
This commit is contained in:
parent
ff584f1577
commit
6cf13e8b98
@ -2,9 +2,7 @@
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from typing import Any, Dict, List, Optional, cast
|
||||
|
||||
import numpy as np
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from haystack.lazy_imports import LazyImport
|
||||
from haystack.utils.auth import Secret
|
||||
@ -78,5 +76,5 @@ class _SentenceTransformersEmbeddingBackend:
|
||||
)
|
||||
|
||||
def embed(self, data: List[str], **kwargs) -> List[List[float]]:
|
||||
embeddings = cast(np.ndarray, self.model.encode(data, **kwargs)).tolist()
|
||||
embeddings = self.model.encode(data, **kwargs).tolist()
|
||||
return embeddings
|
||||
|
||||
@ -5,8 +5,6 @@
|
||||
import os
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
|
||||
from haystack import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -23,7 +21,6 @@ def set_all_seeds(seed: int, deterministic_cudnn: bool = False) -> None:
|
||||
:param deterministic_cudnn: Enable for full reproducibility when using CUDA. Caution: might slow down training.
|
||||
"""
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
os.environ["PYTHONHASHSEED"] = str(seed)
|
||||
|
||||
try:
|
||||
|
||||
@ -2,9 +2,13 @@
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import numpy as np
|
||||
from numpy import exp
|
||||
|
||||
|
||||
def expit(x: float) -> float:
|
||||
"""Compute logistic sigmoid function. Maps input values to a range between 0 and 1"""
|
||||
return 1 / (1 + np.exp(-x))
|
||||
def expit(x) -> float:
|
||||
"""
|
||||
Compute logistic sigmoid function. Maps input values to a range between 0 and 1
|
||||
|
||||
:param x: input value. Can be a scalar or a numpy array.
|
||||
"""
|
||||
return 1 / (1 + exp(-x))
|
||||
|
||||
@ -0,0 +1,4 @@
|
||||
---
|
||||
enhancements:
|
||||
- |
|
||||
Reduced numpy usage to speed up imports.
|
||||
@ -4,9 +4,9 @@
|
||||
import os
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import random
|
||||
import pytest
|
||||
from huggingface_hub.utils import RepositoryNotFoundError
|
||||
from numpy import array, random
|
||||
|
||||
from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
|
||||
from haystack.dataclasses import Document
|
||||
@ -24,7 +24,7 @@ def mock_check_valid_model():
|
||||
|
||||
|
||||
def mock_embedding_generation(json, **kwargs):
|
||||
response = str(array([random.rand(384) for i in range(len(json["inputs"]))]).tolist()).encode()
|
||||
response = str([[random.random() for _ in range(384)] for _ in range(len(json["inputs"]))]).encode()
|
||||
return response
|
||||
|
||||
|
||||
|
||||
@ -4,9 +4,9 @@
|
||||
import os
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import random
|
||||
import pytest
|
||||
from huggingface_hub.utils import RepositoryNotFoundError
|
||||
from numpy import array, random
|
||||
|
||||
from haystack.components.embedders import HuggingFaceAPITextEmbedder
|
||||
from haystack.utils.auth import Secret
|
||||
@ -22,7 +22,7 @@ def mock_check_valid_model():
|
||||
|
||||
|
||||
def mock_embedding_generation(json, **kwargs):
|
||||
response = str(array([random.rand(384) for i in range(len(json["inputs"]))]).tolist()).encode()
|
||||
response = str([[random.random() for _ in range(384)] for _ in range(len(json["inputs"]))]).encode()
|
||||
return response
|
||||
|
||||
|
||||
|
||||
@ -5,7 +5,7 @@ import os
|
||||
from typing import List
|
||||
from haystack.utils.auth import Secret
|
||||
|
||||
import numpy as np
|
||||
import random
|
||||
import pytest
|
||||
|
||||
from haystack import Document
|
||||
@ -16,7 +16,8 @@ def mock_openai_response(input: List[str], model: str = "text-embedding-ada-002"
|
||||
dict_response = {
|
||||
"object": "list",
|
||||
"data": [
|
||||
{"object": "embedding", "index": i, "embedding": np.random.rand(1536).tolist()} for i in range(len(input))
|
||||
{"object": "embedding", "index": i, "embedding": [random.random() for _ in range(1536)]}
|
||||
for i in range(len(input))
|
||||
],
|
||||
"model": model,
|
||||
"usage": {"prompt_tokens": 4, "total_tokens": 4},
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import numpy as np
|
||||
import random
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
@ -264,7 +264,9 @@ class TestSentenceTransformersDocumentEmbedder:
|
||||
def test_run(self):
|
||||
embedder = SentenceTransformersDocumentEmbedder(model="model")
|
||||
embedder.embedding_backend = MagicMock()
|
||||
embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist()
|
||||
embedder.embedding_backend.embed = lambda x, **kwargs: [
|
||||
[random.random() for _ in range(16)] for _ in range(len(x))
|
||||
]
|
||||
|
||||
documents = [Document(content=f"document number {i}") for i in range(5)]
|
||||
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
import random
|
||||
import pytest
|
||||
|
||||
from haystack.components.embedders.sentence_transformers_text_embedder import SentenceTransformersTextEmbedder
|
||||
@ -239,7 +239,9 @@ class TestSentenceTransformersTextEmbedder:
|
||||
def test_run(self):
|
||||
embedder = SentenceTransformersTextEmbedder(model="model")
|
||||
embedder.embedding_backend = MagicMock()
|
||||
embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist()
|
||||
embedder.embedding_backend.embed = lambda x, **kwargs: [
|
||||
[random.random() for _ in range(16)] for _ in range(len(x))
|
||||
]
|
||||
|
||||
text = "a nice text to embed"
|
||||
|
||||
|
||||
@ -5,7 +5,6 @@ import os
|
||||
import math
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from haystack import Pipeline
|
||||
|
||||
@ -4,7 +4,6 @@
|
||||
from typing import Dict, Any
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
|
||||
from haystack import Pipeline, DeserializationError
|
||||
from haystack.document_stores.types import FilterPolicy
|
||||
@ -135,7 +134,7 @@ class TestMemoryEmbeddingRetriever:
|
||||
|
||||
assert "documents" in result
|
||||
assert len(result["documents"]) == top_k
|
||||
assert np.array_equal(result["documents"][0].embedding, [1.0, 1.0, 1.0, 1.0])
|
||||
assert result["documents"][0].embedding == [1.0, 1.0, 1.0, 1.0]
|
||||
|
||||
def test_invalid_run_wrong_store_type(self):
|
||||
SomeOtherDocumentStore = document_store_class("SomeOtherDocumentStore")
|
||||
@ -165,4 +164,4 @@ class TestMemoryEmbeddingRetriever:
|
||||
results_docs = result["retriever"]["documents"]
|
||||
assert results_docs
|
||||
assert len(results_docs) == top_k
|
||||
assert np.array_equal(results_docs[0].embedding, [1.0, 1.0, 1.0, 1.0])
|
||||
assert results_docs[0].embedding == [1.0, 1.0, 1.0, 1.0]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user