mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-02 10:49:30 +00:00
refactor!: extract elasticsearch (#4668)
* extract elasticsearch * update pyproject.toml * make more imports optional * move MockBaseRetriever into conftest * install elasticsearch in the ES integration tests
This commit is contained in:
parent
91b775bf43
commit
1b57b96210
2
.github/workflows/tests.yml
vendored
2
.github/workflows/tests.yml
vendored
@ -242,7 +242,7 @@ jobs:
|
||||
python-version: ${{ env.PYTHON_VERSION }}
|
||||
|
||||
- name: Install Haystack
|
||||
run: pip install .[dev,preprocessing]
|
||||
run: pip install .[elasticsearch,dev,preprocessing]
|
||||
|
||||
- name: Run tests
|
||||
run: |
|
||||
|
||||
@ -1,5 +1,3 @@
|
||||
import os
|
||||
import importlib
|
||||
from haystack.utils.import_utils import safe_import
|
||||
from haystack.document_stores.base import BaseDocumentStore, BaseKnowledgeGraph, KeywordDocumentStore
|
||||
|
||||
@ -7,12 +5,15 @@ from haystack.document_stores.memory import InMemoryDocumentStore
|
||||
from haystack.document_stores.deepsetcloud import DeepsetCloudDocumentStore
|
||||
from haystack.document_stores.utils import eval_data_from_json, eval_data_from_jsonl, squad_json_to_jsonl
|
||||
|
||||
from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
|
||||
from haystack.document_stores.es_converter import (
|
||||
elasticsearch_index_to_document_store,
|
||||
open_search_index_to_document_store,
|
||||
elasticsearch_index_to_document_store = safe_import(
|
||||
"haystack.document_stores.es_converter", "elasticsearch_index_to_document_store", "elasticsearch"
|
||||
)
|
||||
open_search_index_to_document_store = safe_import(
|
||||
"haystack.document_stores.es_converter", "open_search_index_to_document_store", "elasticsearch"
|
||||
)
|
||||
ElasticsearchDocumentStore = safe_import(
|
||||
"haystack.document_stores.elasticsearch", "ElasticsearchDocumentStore", "elasticsearch"
|
||||
)
|
||||
|
||||
OpenSearchDocumentStore = safe_import("haystack.document_stores.opensearch", "OpenSearchDocumentStore", "opensearch")
|
||||
SQLDocumentStore = safe_import("haystack.document_stores.sql", "SQLDocumentStore", "sql")
|
||||
FAISSDocumentStore = safe_import("haystack.document_stores.faiss", "FAISSDocumentStore", "faiss")
|
||||
|
||||
@ -1,7 +1,13 @@
|
||||
from typing import Dict, Optional, List, Union
|
||||
|
||||
from tqdm.auto import tqdm
|
||||
from elasticsearch.helpers import scan
|
||||
|
||||
try:
|
||||
from elasticsearch.helpers import scan
|
||||
except (ImportError, ModuleNotFoundError) as ie:
|
||||
from haystack.utils.import_utils import _optional_component_not_installed
|
||||
|
||||
_optional_component_not_installed(__name__, "elasticsearch", ie)
|
||||
|
||||
from haystack.schema import Document
|
||||
from haystack.document_stores.base import BaseDocumentStore
|
||||
|
||||
@ -65,7 +65,13 @@ from pathlib import Path
|
||||
from itertools import islice
|
||||
|
||||
from tqdm.auto import tqdm
|
||||
from elasticsearch import Elasticsearch
|
||||
|
||||
try:
|
||||
from elasticsearch import Elasticsearch
|
||||
except (ImportError, ModuleNotFoundError) as ie:
|
||||
from haystack.utils.import_utils import _optional_component_not_installed
|
||||
|
||||
_optional_component_not_installed(__name__, "elasticsearch", ie)
|
||||
|
||||
from haystack.document_stores.base import BaseDocumentStore
|
||||
from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore # keep it here !
|
||||
|
||||
@ -74,9 +74,6 @@ dependencies = [
|
||||
# See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder
|
||||
"sentence-transformers>=2.2.0",
|
||||
|
||||
# Elasticsearch
|
||||
"elasticsearch>=7.17,<8",
|
||||
|
||||
# OpenAI tokenizer
|
||||
"tiktoken>=0.3.0; python_version >= '3.8' and (platform_machine == 'AMD64' or platform_machine == 'amd64' or platform_machine == 'x86_64' or (platform_machine == 'arm64' and platform_system == 'Darwin'))",
|
||||
|
||||
@ -91,6 +88,9 @@ dependencies = [
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
elasticsearch = [
|
||||
"elasticsearch>=7.17,<8",
|
||||
]
|
||||
sql = [
|
||||
"sqlalchemy>=1.4.2,<2",
|
||||
"sqlalchemy_utils",
|
||||
@ -135,10 +135,10 @@ opensearch = [
|
||||
"opensearch-py>=2",
|
||||
]
|
||||
docstores = [
|
||||
"farm-haystack[faiss,milvus,weaviate,graphdb,inmemorygraph,pinecone,opensearch]",
|
||||
"farm-haystack[elasticsearch,faiss,milvus,weaviate,graphdb,inmemorygraph,pinecone,opensearch]",
|
||||
]
|
||||
docstores-gpu = [
|
||||
"farm-haystack[faiss-gpu,milvus,weaviate,graphdb,inmemorygraph,pinecone,opensearch]",
|
||||
"farm-haystack[elasticsearch,faiss-gpu,milvus,weaviate,graphdb,inmemorygraph,pinecone,opensearch]",
|
||||
]
|
||||
audio = [
|
||||
"openai-whisper"
|
||||
|
||||
@ -296,6 +296,38 @@ class MockRetriever(BaseRetriever):
|
||||
return [[]]
|
||||
|
||||
|
||||
class MockBaseRetriever(MockRetriever):
    """Retriever test double that answers every query with one fixed document.

    Both retrieval entry points ignore their filtering and ranking arguments and
    simply hand back ``mock_document``; ``embed_documents`` yields constant vectors.
    """

    def __init__(self, document_store: BaseDocumentStore, mock_document: Document):
        # NOTE(review): the parent initializer is not invoked here -- presumably
        # deliberate to keep the mock lightweight; confirm before changing.
        self.document_store = document_store
        self.mock_document = mock_document

    def retrieve(
        self,
        query: str,
        filters: dict,
        top_k: Optional[int],
        index: str,
        headers: Optional[Dict[str, str]],
        scale_score: bool,
    ):
        """Return a one-element list holding the mock document, whatever the query."""
        hits = [self.mock_document]
        return hits

    def retrieve_batch(
        self,
        queries: List[str],
        filters: Optional[Union[FilterType, List[Optional[FilterType]]]] = None,
        top_k: Optional[int] = None,
        index: str = None,
        headers: Optional[Dict[str, str]] = None,
        batch_size: Optional[int] = None,
        scale_score: bool = None,
    ):
        """Return one single-document result list per entry in ``queries``."""
        return [[self.mock_document] for _ in queries]

    def embed_documents(self, documents: List[Document]):
        """Return a ``(len(documents), 768)`` array with every value set to 0.5."""
        embedding_shape = (len(documents), 768)
        return np.full(embedding_shape, 0.5)
|
||||
|
||||
|
||||
class MockSeq2SegGenerator(BaseGenerator):
|
||||
def predict(self, query: str, documents: List[Document], top_k: Optional[int]) -> Dict:
|
||||
pass
|
||||
|
||||
@ -13,7 +13,7 @@ from haystack.errors import FilterError, PineconeDocumentStoreError
|
||||
from haystack.testing import DocumentStoreBaseTestAbstract
|
||||
|
||||
from ..mocks import pinecone as pinecone_mock
|
||||
from ..nodes.test_retriever import MockBaseRetriever
|
||||
from ..conftest import MockBaseRetriever
|
||||
|
||||
# Set metadata fields used during testing for PineconeDocumentStore meta_config
|
||||
META_FIELDS = ["meta_field", "name", "date", "numeric_field", "odd_document"]
|
||||
|
||||
@ -10,9 +10,17 @@ import pandas as pd
|
||||
import requests
|
||||
from boilerpy3.extractors import ArticleExtractor
|
||||
from pandas.testing import assert_frame_equal
|
||||
from elasticsearch import Elasticsearch
|
||||
from transformers import DPRContextEncoderTokenizerFast, DPRQuestionEncoderTokenizerFast
|
||||
|
||||
|
||||
try:
|
||||
from elasticsearch import Elasticsearch
|
||||
except (ImportError, ModuleNotFoundError) as ie:
|
||||
from haystack.utils.import_utils import _optional_component_not_installed
|
||||
|
||||
_optional_component_not_installed(__name__, "elasticsearch", ie)
|
||||
|
||||
|
||||
from haystack.document_stores.base import BaseDocumentStore, FilterType
|
||||
from haystack.document_stores.memory import InMemoryDocumentStore
|
||||
from haystack.document_stores import WeaviateDocumentStore
|
||||
@ -27,7 +35,7 @@ from haystack.nodes.retriever.dense import DensePassageRetriever, EmbeddingRetri
|
||||
from haystack.nodes.retriever.sparse import BM25Retriever, FilterRetriever, TfidfRetriever
|
||||
from haystack.nodes.retriever.multimodal import MultiModalRetriever
|
||||
|
||||
from ..conftest import MockRetriever, fail_at_version
|
||||
from ..conftest import MockBaseRetriever, fail_at_version
|
||||
|
||||
|
||||
# TODO check if this works with only the "memory" arg
|
||||
@ -131,38 +139,6 @@ def test_tfidf_retriever_multiple_indexes():
|
||||
assert tfidf_retriever.document_counts["index_1"] == ds.get_document_count(index="index_1")
|
||||
|
||||
|
||||
class MockBaseRetriever(MockRetriever):
|
||||
def __init__(self, document_store: BaseDocumentStore, mock_document: Document):
|
||||
self.document_store = document_store
|
||||
self.mock_document = mock_document
|
||||
|
||||
def retrieve(
|
||||
self,
|
||||
query: str,
|
||||
filters: dict,
|
||||
top_k: Optional[int],
|
||||
index: str,
|
||||
headers: Optional[Dict[str, str]],
|
||||
scale_score: bool,
|
||||
):
|
||||
return [self.mock_document]
|
||||
|
||||
def retrieve_batch(
|
||||
self,
|
||||
queries: List[str],
|
||||
filters: Optional[Union[FilterType, List[Optional[FilterType]]]] = None,
|
||||
top_k: Optional[int] = None,
|
||||
index: str = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
batch_size: Optional[int] = None,
|
||||
scale_score: bool = None,
|
||||
):
|
||||
return [[self.mock_document] for _ in range(len(queries))]
|
||||
|
||||
def embed_documents(self, documents: List[Document]):
|
||||
return np.full((len(documents), 768), 0.5)
|
||||
|
||||
|
||||
def test_retrieval_empty_query(document_store: BaseDocumentStore):
|
||||
# test with empty query using the run() method
|
||||
mock_document = Document(id="0", content="test")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user