mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-27 07:26:33 +00:00
enhancement: reduced usage of numpy and substituted built-in libraries (#8418)
* Reduced usage of numpy and substituted built-in libraries
* Added release note
* Edited the expit function to support both a float and a list (this case was causing an error in CI)
* Reverted code; numpy can't be removed here
* More cleaning
* Fixed release note

---------

Co-authored-by: anakin87 <stefanofiorucci@gmail.com>
This commit is contained in:
parent
ff584f1577
commit
6cf13e8b98
@ -2,9 +2,7 @@
|
|||||||
#
|
#
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
from typing import Any, Dict, List, Optional, cast
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from haystack.lazy_imports import LazyImport
|
from haystack.lazy_imports import LazyImport
|
||||||
from haystack.utils.auth import Secret
|
from haystack.utils.auth import Secret
|
||||||
@ -78,5 +76,5 @@ class _SentenceTransformersEmbeddingBackend:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def embed(self, data: List[str], **kwargs) -> List[List[float]]:
|
def embed(self, data: List[str], **kwargs) -> List[List[float]]:
|
||||||
embeddings = cast(np.ndarray, self.model.encode(data, **kwargs)).tolist()
|
embeddings = self.model.encode(data, **kwargs).tolist()
|
||||||
return embeddings
|
return embeddings
|
||||||
|
|||||||
@ -5,8 +5,6 @@
|
|||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from haystack import logging
|
from haystack import logging
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@ -23,7 +21,6 @@ def set_all_seeds(seed: int, deterministic_cudnn: bool = False) -> None:
|
|||||||
:param deterministic_cudnn: Enable for full reproducibility when using CUDA. Caution: might slow down training.
|
:param deterministic_cudnn: Enable for full reproducibility when using CUDA. Caution: might slow down training.
|
||||||
"""
|
"""
|
||||||
random.seed(seed)
|
random.seed(seed)
|
||||||
np.random.seed(seed)
|
|
||||||
os.environ["PYTHONHASHSEED"] = str(seed)
|
os.environ["PYTHONHASHSEED"] = str(seed)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|||||||
@ -2,9 +2,13 @@
|
|||||||
#
|
#
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
import numpy as np
|
from numpy import exp
|
||||||
|
|
||||||
|
|
||||||
def expit(x: float) -> float:
|
def expit(x) -> float:
|
||||||
"""Compute logistic sigmoid function. Maps input values to a range between 0 and 1"""
|
"""
|
||||||
return 1 / (1 + np.exp(-x))
|
Compute logistic sigmoid function. Maps input values to a range between 0 and 1
|
||||||
|
|
||||||
|
:param x: input value. Can be a scalar or a numpy array.
|
||||||
|
"""
|
||||||
|
return 1 / (1 + exp(-x))
|
||||||
|
|||||||
@ -0,0 +1,4 @@
|
|||||||
|
---
|
||||||
|
enhancements:
|
||||||
|
- |
|
||||||
|
Reduced numpy usage to speed up imports.
|
||||||
@ -4,9 +4,9 @@
|
|||||||
import os
|
import os
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import random
|
||||||
import pytest
|
import pytest
|
||||||
from huggingface_hub.utils import RepositoryNotFoundError
|
from huggingface_hub.utils import RepositoryNotFoundError
|
||||||
from numpy import array, random
|
|
||||||
|
|
||||||
from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
|
from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
|
||||||
from haystack.dataclasses import Document
|
from haystack.dataclasses import Document
|
||||||
@ -24,7 +24,7 @@ def mock_check_valid_model():
|
|||||||
|
|
||||||
|
|
||||||
def mock_embedding_generation(json, **kwargs):
|
def mock_embedding_generation(json, **kwargs):
|
||||||
response = str(array([random.rand(384) for i in range(len(json["inputs"]))]).tolist()).encode()
|
response = str([[random.random() for _ in range(384)] for _ in range(len(json["inputs"]))]).encode()
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -4,9 +4,9 @@
|
|||||||
import os
|
import os
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import random
|
||||||
import pytest
|
import pytest
|
||||||
from huggingface_hub.utils import RepositoryNotFoundError
|
from huggingface_hub.utils import RepositoryNotFoundError
|
||||||
from numpy import array, random
|
|
||||||
|
|
||||||
from haystack.components.embedders import HuggingFaceAPITextEmbedder
|
from haystack.components.embedders import HuggingFaceAPITextEmbedder
|
||||||
from haystack.utils.auth import Secret
|
from haystack.utils.auth import Secret
|
||||||
@ -22,7 +22,7 @@ def mock_check_valid_model():
|
|||||||
|
|
||||||
|
|
||||||
def mock_embedding_generation(json, **kwargs):
|
def mock_embedding_generation(json, **kwargs):
|
||||||
response = str(array([random.rand(384) for i in range(len(json["inputs"]))]).tolist()).encode()
|
response = str([[random.random() for _ in range(384)] for _ in range(len(json["inputs"]))]).encode()
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -5,7 +5,7 @@ import os
|
|||||||
from typing import List
|
from typing import List
|
||||||
from haystack.utils.auth import Secret
|
from haystack.utils.auth import Secret
|
||||||
|
|
||||||
import numpy as np
|
import random
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from haystack import Document
|
from haystack import Document
|
||||||
@ -16,7 +16,8 @@ def mock_openai_response(input: List[str], model: str = "text-embedding-ada-002"
|
|||||||
dict_response = {
|
dict_response = {
|
||||||
"object": "list",
|
"object": "list",
|
||||||
"data": [
|
"data": [
|
||||||
{"object": "embedding", "index": i, "embedding": np.random.rand(1536).tolist()} for i in range(len(input))
|
{"object": "embedding", "index": i, "embedding": [random.random() for _ in range(1536)]}
|
||||||
|
for i in range(len(input))
|
||||||
],
|
],
|
||||||
"model": model,
|
"model": model,
|
||||||
"usage": {"prompt_tokens": 4, "total_tokens": 4},
|
"usage": {"prompt_tokens": 4, "total_tokens": 4},
|
||||||
|
|||||||
@ -3,7 +3,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
import numpy as np
|
import random
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@ -264,7 +264,9 @@ class TestSentenceTransformersDocumentEmbedder:
|
|||||||
def test_run(self):
|
def test_run(self):
|
||||||
embedder = SentenceTransformersDocumentEmbedder(model="model")
|
embedder = SentenceTransformersDocumentEmbedder(model="model")
|
||||||
embedder.embedding_backend = MagicMock()
|
embedder.embedding_backend = MagicMock()
|
||||||
embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist()
|
embedder.embedding_backend.embed = lambda x, **kwargs: [
|
||||||
|
[random.random() for _ in range(16)] for _ in range(len(x))
|
||||||
|
]
|
||||||
|
|
||||||
documents = [Document(content=f"document number {i}") for i in range(5)]
|
documents = [Document(content=f"document number {i}") for i in range(5)]
|
||||||
|
|
||||||
|
|||||||
@ -4,7 +4,7 @@
|
|||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import numpy as np
|
import random
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from haystack.components.embedders.sentence_transformers_text_embedder import SentenceTransformersTextEmbedder
|
from haystack.components.embedders.sentence_transformers_text_embedder import SentenceTransformersTextEmbedder
|
||||||
@ -239,7 +239,9 @@ class TestSentenceTransformersTextEmbedder:
|
|||||||
def test_run(self):
|
def test_run(self):
|
||||||
embedder = SentenceTransformersTextEmbedder(model="model")
|
embedder = SentenceTransformersTextEmbedder(model="model")
|
||||||
embedder.embedding_backend = MagicMock()
|
embedder.embedding_backend = MagicMock()
|
||||||
embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist()
|
embedder.embedding_backend.embed = lambda x, **kwargs: [
|
||||||
|
[random.random() for _ in range(16)] for _ in range(len(x))
|
||||||
|
]
|
||||||
|
|
||||||
text = "a nice text to embed"
|
text = "a nice text to embed"
|
||||||
|
|
||||||
|
|||||||
@ -5,7 +5,6 @@ import os
|
|||||||
import math
|
import math
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from haystack import Pipeline
|
from haystack import Pipeline
|
||||||
|
|||||||
@ -4,7 +4,6 @@
|
|||||||
from typing import Dict, Any
|
from typing import Dict, Any
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from haystack import Pipeline, DeserializationError
|
from haystack import Pipeline, DeserializationError
|
||||||
from haystack.document_stores.types import FilterPolicy
|
from haystack.document_stores.types import FilterPolicy
|
||||||
@ -135,7 +134,7 @@ class TestMemoryEmbeddingRetriever:
|
|||||||
|
|
||||||
assert "documents" in result
|
assert "documents" in result
|
||||||
assert len(result["documents"]) == top_k
|
assert len(result["documents"]) == top_k
|
||||||
assert np.array_equal(result["documents"][0].embedding, [1.0, 1.0, 1.0, 1.0])
|
assert result["documents"][0].embedding == [1.0, 1.0, 1.0, 1.0]
|
||||||
|
|
||||||
def test_invalid_run_wrong_store_type(self):
|
def test_invalid_run_wrong_store_type(self):
|
||||||
SomeOtherDocumentStore = document_store_class("SomeOtherDocumentStore")
|
SomeOtherDocumentStore = document_store_class("SomeOtherDocumentStore")
|
||||||
@ -165,4 +164,4 @@ class TestMemoryEmbeddingRetriever:
|
|||||||
results_docs = result["retriever"]["documents"]
|
results_docs = result["retriever"]["documents"]
|
||||||
assert results_docs
|
assert results_docs
|
||||||
assert len(results_docs) == top_k
|
assert len(results_docs) == top_k
|
||||||
assert np.array_equal(results_docs[0].embedding, [1.0, 1.0, 1.0, 1.0])
|
assert results_docs[0].embedding == [1.0, 1.0, 1.0, 1.0]
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user