refactor!: extract elasticsearch (#4668)

* extract elasticsearch

* update pyproject.toml

* make more imports optional

* move MockBaseRetriever into conftest

* install es in the es integration tests
ZanSara authored 2023-04-26 10:14:20 +02:00, committed by GitHub
parent 91b775bf43
commit 1b57b96210
8 changed files with 71 additions and 50 deletions

View File

@@ -242,7 +242,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
       - name: Install Haystack
-        run: pip install .[dev,preprocessing]
+        run: pip install .[elasticsearch,dev,preprocessing]
       - name: Run tests
         run: |

View File

@@ -1,5 +1,3 @@
-import os
-import importlib
 from haystack.utils.import_utils import safe_import
 from haystack.document_stores.base import BaseDocumentStore, BaseKnowledgeGraph, KeywordDocumentStore
@@ -7,12 +5,15 @@ from haystack.document_stores.memory import InMemoryDocumentStore
 from haystack.document_stores.deepsetcloud import DeepsetCloudDocumentStore
 from haystack.document_stores.utils import eval_data_from_json, eval_data_from_jsonl, squad_json_to_jsonl
-from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
-from haystack.document_stores.es_converter import (
-    elasticsearch_index_to_document_store,
-    open_search_index_to_document_store,
-)
+elasticsearch_index_to_document_store = safe_import(
+    "haystack.document_stores.es_converter", "elasticsearch_index_to_document_store", "elasticsearch"
+)
+open_search_index_to_document_store = safe_import(
+    "haystack.document_stores.es_converter", "open_search_index_to_document_store", "elasticsearch"
+)
+ElasticsearchDocumentStore = safe_import(
+    "haystack.document_stores.elasticsearch", "ElasticsearchDocumentStore", "elasticsearch"
+)

 OpenSearchDocumentStore = safe_import("haystack.document_stores.opensearch", "OpenSearchDocumentStore", "opensearch")
 SQLDocumentStore = safe_import("haystack.document_stores.sql", "SQLDocumentStore", "sql")
 FAISSDocumentStore = safe_import("haystack.document_stores.faiss", "FAISSDocumentStore", "faiss")
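
The `safe_import` bindings above resolve the Elasticsearch names lazily, so `haystack.document_stores` can be imported even when the `elasticsearch` package is not installed. Below is only a minimal sketch of that lazy-import pattern, not Haystack's actual implementation (the real helper lives in `haystack.utils.import_utils` and may behave differently):

# Illustration of a safe_import-style helper, NOT Haystack's actual code:
# resolve the attribute if the optional package is present, otherwise return
# a placeholder that raises a helpful error the moment it is used.
import importlib


def safe_import_sketch(module_path: str, name: str, dep_group: str):
    try:
        return getattr(importlib.import_module(module_path), name)
    except (ImportError, ModuleNotFoundError):

        class MissingOptionalDependency:
            def __init__(self, *args, **kwargs):
                raise ImportError(
                    f"{name} requires the '{dep_group}' extra. "
                    f"Install it with: pip install farm-haystack[{dep_group}]"
                )

        return MissingOptionalDependency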

View File

@@ -1,7 +1,13 @@
 from typing import Dict, Optional, List, Union

 from tqdm.auto import tqdm
-from elasticsearch.helpers import scan
+
+try:
+    from elasticsearch.helpers import scan
+except (ImportError, ModuleNotFoundError) as ie:
+    from haystack.utils.import_utils import _optional_component_not_installed
+
+    _optional_component_not_installed(__name__, "elasticsearch", ie)

 from haystack.schema import Document
 from haystack.document_stores.base import BaseDocumentStore
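
The same try/except guard appears in every module this commit touches that needs the Elasticsearch client: the import is attempted, and if the package is missing, `_optional_component_not_installed` is called to surface which extra to install. What follows is only a rough approximation of that helper, inferred from how the diff uses it; the real implementation in `haystack.utils.import_utils` may word things differently:

# Approximation for illustration only; inferred from the call sites above.
def _optional_component_not_installed_sketch(component: str, dep_group: str, source_error: BaseException) -> None:
    raise ImportError(
        f"Failed to import '{component}', which is an optional Haystack component. "
        f"Run 'pip install farm-haystack[{dep_group}]' to install its dependencies."
    ) from source_error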

View File

@@ -65,7 +65,13 @@ from pathlib import Path
 from itertools import islice
 from tqdm.auto import tqdm
-from elasticsearch import Elasticsearch
+
+try:
+    from elasticsearch import Elasticsearch
+except (ImportError, ModuleNotFoundError) as ie:
+    from haystack.utils.import_utils import _optional_component_not_installed
+
+    _optional_component_not_installed(__name__, "elasticsearch", ie)

 from haystack.document_stores.base import BaseDocumentStore
 from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore # keep it here !
View File

@@ -74,9 +74,6 @@ dependencies = [
     # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder
     "sentence-transformers>=2.2.0",
-    # Elasticsearch
-    "elasticsearch>=7.17,<8",
     # OpenAI tokenizer
     "tiktoken>=0.3.0; python_version >= '3.8' and (platform_machine == 'AMD64' or platform_machine == 'amd64' or platform_machine == 'x86_64' or (platform_machine == 'arm64' and platform_system == 'Darwin'))",
@@ -91,6 +88,9 @@ dependencies = [
 ]

 [project.optional-dependencies]
+elasticsearch = [
+    "elasticsearch>=7.17,<8",
+]
 sql = [
     "sqlalchemy>=1.4.2,<2",
     "sqlalchemy_utils",
@@ -135,10 +135,10 @@ opensearch = [
     "opensearch-py>=2",
 ]
 docstores = [
-    "farm-haystack[faiss,milvus,weaviate,graphdb,inmemorygraph,pinecone,opensearch]",
+    "farm-haystack[elasticsearch,faiss,milvus,weaviate,graphdb,inmemorygraph,pinecone,opensearch]",
 ]
 docstores-gpu = [
-    "farm-haystack[faiss-gpu,milvus,weaviate,graphdb,inmemorygraph,pinecone,opensearch]",
+    "farm-haystack[elasticsearch,faiss-gpu,milvus,weaviate,graphdb,inmemorygraph,pinecone,opensearch]",
 ]
 audio = [
     "openai-whisper"

View File

@@ -296,6 +296,38 @@ class MockRetriever(BaseRetriever):
         return [[]]


+class MockBaseRetriever(MockRetriever):
+    def __init__(self, document_store: BaseDocumentStore, mock_document: Document):
+        self.document_store = document_store
+        self.mock_document = mock_document
+
+    def retrieve(
+        self,
+        query: str,
+        filters: dict,
+        top_k: Optional[int],
+        index: str,
+        headers: Optional[Dict[str, str]],
+        scale_score: bool,
+    ):
+        return [self.mock_document]
+
+    def retrieve_batch(
+        self,
+        queries: List[str],
+        filters: Optional[Union[FilterType, List[Optional[FilterType]]]] = None,
+        top_k: Optional[int] = None,
+        index: str = None,
+        headers: Optional[Dict[str, str]] = None,
+        batch_size: Optional[int] = None,
+        scale_score: bool = None,
+    ):
+        return [[self.mock_document] for _ in range(len(queries))]
+
+    def embed_documents(self, documents: List[Document]):
+        return np.full((len(documents), 768), 0.5)
+
+
 class MockSeq2SegGenerator(BaseGenerator):
     def predict(self, query: str, documents: List[Document], top_k: Optional[int]) -> Dict:
         pass
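
With `MockBaseRetriever` now defined in the shared conftest (the pinecone and retriever test diffs below switch their imports accordingly), any test module in the suite can reuse it. A hedged usage sketch, assuming it sits in a module inside the tests package so the relative import resolves:

# Illustrative test, not part of this commit.
from haystack.document_stores.memory import InMemoryDocumentStore
from haystack.schema import Document

from ..conftest import MockBaseRetriever  # resolves only inside the tests package


def test_mock_base_retriever_returns_the_given_document():
    doc = Document(id="0", content="test")
    retriever = MockBaseRetriever(document_store=InMemoryDocumentStore(), mock_document=doc)
    results = retriever.retrieve(
        query="irrelevant", filters=None, top_k=1, index="document", headers=None, scale_score=False
    )
    assert results[0] is doc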

View File

@@ -13,7 +13,7 @@ from haystack.errors import FilterError, PineconeDocumentStoreError
 from haystack.testing import DocumentStoreBaseTestAbstract

 from ..mocks import pinecone as pinecone_mock
-from ..nodes.test_retriever import MockBaseRetriever
+from ..conftest import MockBaseRetriever

 # Set metadata fields used during testing for PineconeDocumentStore meta_config
 META_FIELDS = ["meta_field", "name", "date", "numeric_field", "odd_document"]

View File

@@ -10,9 +10,17 @@ import pandas as pd
 import requests
 from boilerpy3.extractors import ArticleExtractor
 from pandas.testing import assert_frame_equal
-from elasticsearch import Elasticsearch
 from transformers import DPRContextEncoderTokenizerFast, DPRQuestionEncoderTokenizerFast

+try:
+    from elasticsearch import Elasticsearch
+except (ImportError, ModuleNotFoundError) as ie:
+    from haystack.utils.import_utils import _optional_component_not_installed
+
+    _optional_component_not_installed(__name__, "elasticsearch", ie)
+
 from haystack.document_stores.base import BaseDocumentStore, FilterType
 from haystack.document_stores.memory import InMemoryDocumentStore
 from haystack.document_stores import WeaviateDocumentStore
@@ -27,7 +35,7 @@ from haystack.nodes.retriever.dense import DensePassageRetriever, EmbeddingRetri
 from haystack.nodes.retriever.sparse import BM25Retriever, FilterRetriever, TfidfRetriever
 from haystack.nodes.retriever.multimodal import MultiModalRetriever
-from ..conftest import MockRetriever, fail_at_version
+from ..conftest import MockBaseRetriever, fail_at_version

 # TODO check if we this works with only "memory" arg
@@ -131,38 +139,6 @@ def test_tfidf_retriever_multiple_indexes():
     assert tfidf_retriever.document_counts["index_1"] == ds.get_document_count(index="index_1")


-class MockBaseRetriever(MockRetriever):
-    def __init__(self, document_store: BaseDocumentStore, mock_document: Document):
-        self.document_store = document_store
-        self.mock_document = mock_document
-
-    def retrieve(
-        self,
-        query: str,
-        filters: dict,
-        top_k: Optional[int],
-        index: str,
-        headers: Optional[Dict[str, str]],
-        scale_score: bool,
-    ):
-        return [self.mock_document]
-
-    def retrieve_batch(
-        self,
-        queries: List[str],
-        filters: Optional[Union[FilterType, List[Optional[FilterType]]]] = None,
-        top_k: Optional[int] = None,
-        index: str = None,
-        headers: Optional[Dict[str, str]] = None,
-        batch_size: Optional[int] = None,
-        scale_score: bool = None,
-    ):
-        return [[self.mock_document] for _ in range(len(queries))]
-
-    def embed_documents(self, documents: List[Document]):
-        return np.full((len(documents), 768), 0.5)
-
-
 def test_retrieval_empty_query(document_store: BaseDocumentStore):
     # test with empty query using the run() method
     mock_document = Document(id="0", content="test")