Document Store test refactoring (#3449)

* add new marker

* start using test hierarchies

* move ES tests into their own class

* refactor test workflow

* job steps

* add more tests

* move more tests

* more tests

* test labels

* add more tests

* Update tests.yml

* Update tests.yml

* fix

* typo

* fix es image tag

* map es ports

* try

* fix

* default port

* remove opensearch from the markers sorcery

* revert

* skip new tests in old jobs

* skip opensearch_faiss
Authored by Massimiliano Pippi on 2022-10-31 15:30:14 +01:00, committed by GitHub
parent 85cdc1040a
commit b694c7b5cb
9 changed files with 845 additions and 560 deletions
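Taken together, the bullets above describe one pattern: shared Document Store tests move into abstract base classes (no `Test` prefix, so pytest never collects them directly), and each store-specific suite inherits them, supplying only a `ds` fixture. A minimal sketch of that pattern with a hypothetical in-memory suite (class names here are illustrative, not part of the diff):

import pytest
from haystack.schema import Document
from haystack.document_stores import InMemoryDocumentStore


@pytest.mark.document_store
class MyDocumentStoreTestAbstract:
    # No `Test` prefix: pytest collects these methods only through subclasses.
    @pytest.fixture
    def documents(self):
        return [Document(content=f"doc {i}") for i in range(3)]

    @pytest.mark.integration
    def test_write_documents(self, ds, documents):
        ds.write_documents(documents)
        assert ds.get_document_count() == len(documents)


class TestInMemoryDocumentStore(MyDocumentStoreTestAbstract):
    # A concrete suite only has to provide the `ds` fixture.
    @pytest.fixture
    def ds(self):
        return InMemoryDocumentStore()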

View File

@@ -92,17 +92,22 @@ jobs:
if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main'
unit-tests:
name: Unit / ${{ matrix.os }}
name: Unit / ${{ matrix.topic }} / ${{ matrix.os }}
needs:
- mypy
- pylint
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
os:
- ubuntu-latest
- windows-latest
- macos-latest
topic:
- document_stores
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Setup Python
uses: ./.github/actions/python_cache/
@@ -111,7 +116,7 @@ jobs:
run: pip install .[all]
- name: Run
run: pytest -m "unit" test/
run: pytest -m "unit" test/${{ matrix.topic }}
- uses: act10ns/slack@v1
with:
@@ -119,6 +124,86 @@ jobs:
channel: '#haystack'
if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main'
integration-tests-elasticsearch:
name: Integration / Elasticsearch / ${{ matrix.os }}
needs:
- unit-tests
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
services:
elasticsearch:
image: elasticsearch:7.17.6
env:
discovery.type: "single-node"
ES_JAVA_OPTS: "-Xms128m -Xmx256m"
ports:
- 9200:9200
# env:
# ELASTICSEARCH_HOST: "elasticsearch"
steps:
- uses: actions/checkout@v3
- name: Setup Python
uses: ./.github/actions/python_cache/
- name: Install Haystack
run: pip install -U .[docstores]
- name: Run tests
run: |
pytest -x -m "document_store and integration" test/document_stores/test_elasticsearch.py
- uses: act10ns/slack@v1
with:
status: ${{ job.status }}
channel: '#haystack'
if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main'
integration-tests-opensearch:
name: Integration / Opensearch / ${{ matrix.os }}
needs:
- unit-tests
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
services:
opensearch:
image: opensearchproject/opensearch:1.3.5
env:
discovery.type: "single-node"
ES_JAVA_OPTS: "-Xms128m -Xmx256m"
ports:
- 9200:9200
# env:
# OPENSEARCH_HOST: "opensearch"
steps:
- uses: actions/checkout@v3
- name: Setup Python
uses: ./.github/actions/python_cache/
- name: Install Haystack
run: pip install -U .[docstores]
- name: Run tests
run: |
pytest -x -m "document_store and integration" test/document_stores/test_opensearch.py
- uses: act10ns/slack@v1
with:
status: ${{ job.status }}
channel: '#haystack'
if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main'
#
# TODO: the following steps need to be revisited
#
unit-tests-linux:
needs:
- mypy
@@ -216,117 +301,6 @@ jobs:
channel: '#haystack'
if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main'
elasticsearch-tests-linux:
needs:
- mypy
- pylint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Setup Elasticsearch
run: |
docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2
# TODO Let's try to remove this one from the unit tests
- name: Install pdftotext
run: wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz && tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin
- name: Setup Python
uses: ./.github/actions/python_cache/
- name: Install Haystack
run: pip install .
- name: Run tests
env:
TOKENIZERS_PARALLELISM: 'false'
run: |
pytest ${{ env.PYTEST_PARAMS }} -m "elasticsearch and not integration" test/document_stores/ --document_store_type=elasticsearch
- name: Dump docker logs on failure
if: failure()
uses: jwalton/gh-docker-logs@v1
- uses: act10ns/slack@v1
with:
status: ${{ job.status }}
channel: '#haystack'
if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main'
elasticsearch-tests-windows:
needs:
- mypy
- pylint
runs-on: windows-latest
if: contains(github.event.pull_request.labels.*.name, 'topic:windows') || !github.event.pull_request.draft
steps:
- uses: actions/checkout@v2
- name: Install dependencies
run: |
choco install --no-progress xpdf-utils
choco install --no-progress openjdk --version=11.0.2.01
refreshenv
choco install --no-progress elasticsearch --version=7.9.2
refreshenv
Get-Service elasticsearch-service-x64 | Start-Service
- name: Setup Python
uses: ./.github/actions/python_cache/
with:
prefix: windows
- name: Run tests
env:
TOKENIZERS_PARALLELISM: 'false'
run: |
pytest ${{ env.PYTEST_PARAMS }} -m "elasticsearch and not integration" test/document_stores/ ${{ env.SUITES_EXCLUDED_FROM_WINDOWS }} --document_store_type=elasticsearch
- uses: act10ns/slack@v1
with:
status: ${{ job.status }}
channel: '#haystack'
if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main'
opensearch-tests-linux:
needs:
- mypy
- pylint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Setup Opensearch
run: |
docker run -d -p 9201:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:1.3.5
# TODO Let's try to remove this one from the unit tests
- name: Install pdftotext
run: wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz && tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin
- name: Setup Python
uses: ./.github/actions/python_cache/
- name: Install Haystack
run: pip install .
- name: Run tests
env:
TOKENIZERS_PARALLELISM: 'false'
run: |
pytest ${{ env.PYTEST_PARAMS }} -m "opensearch and not integration" test/document_stores/test_document_store.py --document_store_type=opensearch
- name: Dump docker logs on failure
if: failure()
uses: jwalton/gh-docker-logs@v1
- uses: act10ns/slack@v1
with:
status: ${{ job.status }}
channel: '#haystack'
if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main'
faiss-tests-linux:
needs:
@@ -656,7 +630,6 @@ jobs:
integration-tests-linux:
needs:
- unit-tests-linux
- elasticsearch-tests-linux
timeout-minutes: 60
strategy:
@@ -691,7 +664,6 @@ jobs:
run: |
python -c "from transformers import AutoModel;[AutoModel.from_pretrained(model_name) for model_name in ['vblagoje/bart_lfqa','yjernite/bart_eli5', 'vblagoje/dpr-ctx_encoder-single-lfqa-wiki', 'vblagoje/dpr-question_encoder-single-lfqa-wiki', 'facebook/dpr-question_encoder-single-nq-base', 'facebook/dpr-ctx_encoder-single-nq-base', 'elastic/distilbert-base-cased-finetuned-conll03-english', 'deepset/bert-medium-squad2-distilled']]"
- name: Run Elasticsearch
run: |
docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2
@@ -738,8 +710,9 @@ jobs:
- name: Run tests
env:
TOKENIZERS_PARALLELISM: 'false' # Avoid logspam by tokenizers
# we add "and not document_store" to exclude the tests that were ported to the new strategy
run: |
pytest ${{ env.PYTEST_PARAMS }} -m "integration" test/${{ matrix.folder }}
pytest ${{ env.PYTEST_PARAMS }} -m "integration and not document_store" test/${{ matrix.folder }}
- name: Dump docker logs on failure
if: failure()
@@ -754,7 +727,6 @@ jobs:
integration-tests-windows:
needs:
- unit-tests-windows
- elasticsearch-tests-windows
runs-on: windows-latest
if: contains(github.event.pull_request.labels.*.name, 'topic:windows') || !github.event.pull_request.draft
@@ -800,4 +772,4 @@ jobs:
with:
status: ${{ job.status }}
channel: '#haystack'
if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main'
if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main'

View File

@@ -2,7 +2,7 @@ def pytest_addoption(parser):
parser.addoption(
"--document_store_type",
action="store",
default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate, pinecone, opensearch",
default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate, pinecone",
)
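(`opensearch` drops out of the default list because its tests now run through the new class-based suites instead of the `--document_store_type` machinery.)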

View File

@@ -351,6 +351,7 @@ markers = [
"milvus: requires a Milvus 2 setup",
"milvus1: requires a Milvus 1 container",
"opensearch",
"document_store",
]
log_cli = true
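The new `document_store` marker is what the refactored CI jobs select on, e.g. `pytest -m "document_store and integration" test/document_stores/test_opensearch.py` in the workflow above.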

View File

@@ -152,7 +152,6 @@ def pytest_collection_modifyitems(config, items):
"pinecone": [pytest.mark.pinecone],
# FIXME GraphDB can't be treated as a regular docstore, it fails most of their tests
"graphdb": [pytest.mark.integration],
"opensearch": [pytest.mark.opensearch],
}
for item in items:
for name, markers in name_to_markers.items():
@@ -196,17 +195,7 @@ def infer_required_doc_store(item, keywords):
# 2. if the test name contains the docstore name, we use that
# 3. use an arbitrary one by calling set.pop()
required_doc_store = None
all_doc_stores = {
"elasticsearch",
"faiss",
"sql",
"memory",
"milvus1",
"milvus",
"weaviate",
"pinecone",
"opensearch",
}
all_doc_stores = {"elasticsearch", "faiss", "sql", "memory", "milvus1", "milvus", "weaviate", "pinecone"}
docstore_markers = set(keywords).intersection(all_doc_stores)
if len(docstore_markers) > 1:
# if parameterized infer the docstore from the parameter
@@ -1109,18 +1098,6 @@ def get_document_store(
knn_engine="faiss",
)
elif document_store_type == "opensearch":
document_store = OpenSearchDocumentStore(
index=index,
return_embedding=True,
embedding_dim=embedding_dim,
embedding_field=embedding_field,
similarity=similarity,
recreate_index=recreate_index,
port=9201,
knn_engine="nmslib",
)
else:
raise Exception(f"No document store fixture for '{document_store_type}'")

View File

@@ -0,0 +1,445 @@
import pytest
import numpy as np
from haystack.schema import Document, Label, Answer
from haystack.errors import DuplicateDocumentError
from haystack.document_stores import BaseDocumentStore
@pytest.mark.document_store
class DocumentStoreBaseTestAbstract:
"""
This base class tests the abstract methods of BaseDocumentStore and is meant to be inherited by every
Document Store test suite. It doesn't have the `Test` prefix in its name, so its methods are not collected
for this class itself but only for its subclasses.
"""
@pytest.fixture
def documents(self):
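# Build 9 documents: for each of 3 iterations, a "Foo" doc (year 2020),
# a "Bar" doc (year 2021), and one without an embedding (month "03").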
documents = []
for i in range(3):
documents.append(
Document(
content=f"A Foo Document {i}",
meta={"name": f"name_{i}", "year": "2020", "month": "01", "numbers": [2, 4]},
embedding=np.random.rand(768).astype(np.float32),
)
)
documents.append(
Document(
content=f"A Bar Document {i}",
meta={"name": f"name_{i}", "year": "2021", "month": "02", "numbers": [-2, -4]},
embedding=np.random.rand(768).astype(np.float32),
)
)
documents.append(
Document(
content=f"Document {i} without embeddings",
meta={"name": f"name_{i}", "no_embedding": True, "month": "03"},
)
)
return documents
@pytest.fixture
def labels(self, documents):
labels = []
for i, d in enumerate(documents):
labels.append(
Label(
query=f"query_{i}",
document=d,
is_correct_document=True,
is_correct_answer=False,
# create a mixed set of labels
origin="user-feedback" if i % 2 else "gold-label",
answer=None if not i else Answer(f"the answer is {i}"),
meta={"name": f"label_{i}", "year": f"{2020 + i}"},
)
)
return labels
#
# Integration tests
#
@pytest.mark.integration
def test_write_documents(self, ds, documents):
ds.write_documents(documents)
docs = ds.get_all_documents()
assert len(docs) == len(documents)
for i, doc in enumerate(docs):
expected = documents[i]
assert doc.id == expected.id
@pytest.mark.integration
def test_write_labels(self, ds, labels):
ds.write_labels(labels)
assert ds.get_all_labels() == labels
@pytest.mark.integration
def test_write_with_duplicate_doc_ids(self, ds):
duplicate_documents = [
Document(content="Doc1", id_hash_keys=["content"]),
Document(content="Doc1", id_hash_keys=["content"]),
]
ds.write_documents(duplicate_documents, duplicate_documents="skip")
assert len(ds.get_all_documents()) == 1
with pytest.raises(Exception):
ds.write_documents(duplicate_documents, duplicate_documents="fail")
@pytest.mark.skip
@pytest.mark.integration
def test_get_all_documents_without_filters(self, ds, documents):
ds.write_documents(documents)
out = ds.get_all_documents()
assert out == documents
@pytest.mark.integration
def test_get_all_document_filter_duplicate_text_value(self, ds):
documents = [
Document(content="duplicated", meta={"meta_field": "0"}, id_hash_keys=["meta"]),
Document(content="duplicated", meta={"meta_field": "1", "name": "file.txt"}, id_hash_keys=["meta"]),
Document(content="Doc2", meta={"name": "file_2.txt"}, id_hash_keys=["meta"]),
]
ds.write_documents(documents)
documents = ds.get_all_documents(filters={"meta_field": ["1"]})
assert len(documents) == 1
assert documents[0].content == "duplicated"
assert documents[0].meta["name"] == "file.txt"
documents = ds.get_all_documents(filters={"meta_field": ["0"]})
assert len(documents) == 1
assert documents[0].content == "duplicated"
assert documents[0].meta.get("name") is None
documents = ds.get_all_documents(filters={"name": ["file_2.txt"]})
assert len(documents) == 1
assert documents[0].content == "Doc2"
assert documents[0].meta.get("meta_field") is None
@pytest.mark.integration
def test_get_all_documents_with_correct_filters(self, ds, documents):
ds.write_documents(documents)
result = ds.get_all_documents(filters={"year": ["2020"]})
assert len(result) == 3
documents = ds.get_all_documents(filters={"year": ["2020", "2021"]})
assert len(documents) == 6
@pytest.mark.integration
def test_get_all_documents_with_incorrect_filter_name(self, ds, documents):
ds.write_documents(documents)
result = ds.get_all_documents(filters={"non_existing_meta_field": ["whatever"]})
assert len(result) == 0
@pytest.mark.integration
def test_get_all_documents_with_incorrect_filter_value(self, ds, documents):
ds.write_documents(documents)
result = ds.get_all_documents(filters={"year": ["nope"]})
assert len(result) == 0
@pytest.mark.integration
def test_extended_filter(self, ds, documents):
ds.write_documents(documents)
# Test comparison operators individually
result = ds.get_all_documents(filters={"year": {"$eq": "2020"}})
assert len(result) == 3
result = ds.get_all_documents(filters={"year": "2020"})
assert len(result) == 3
result = ds.get_all_documents(filters={"year": {"$in": ["2020", "2021", "n.a."]}})
assert len(result) == 6
result = ds.get_all_documents(filters={"year": ["2020", "2021", "n.a."]})
assert len(result) == 6
result = ds.get_all_documents(filters={"year": {"$ne": "2020"}})
assert len(result) == 6
result = ds.get_all_documents(filters={"year": {"$nin": ["2020", "2021", "n.a."]}})
assert len(result) == 3
result = ds.get_all_documents(filters={"numbers": {"$gt": 0}})
assert len(result) == 3
result = ds.get_all_documents(filters={"numbers": {"$gte": -2}})
assert len(result) == 6
result = ds.get_all_documents(filters={"numbers": {"$lt": 0}})
assert len(result) == 3
result = ds.get_all_documents(filters={"numbers": {"$lte": 2.0}})
assert len(result) == 6
# Test compound filters
result = ds.get_all_documents(filters={"year": {"$lte": "2021", "$gte": "2020"}})
assert len(result) == 6
filters = {"$and": {"year": {"$lte": "2021", "$gte": "2020"}, "name": {"$in": ["name_0", "name_1"]}}}
result = ds.get_all_documents(filters=filters)
assert len(result) == 4
filters_simplified = {"year": {"$lte": "2021", "$gte": "2020"}, "name": ["name_0", "name_1"]}
result = ds.get_all_documents(filters=filters_simplified)
assert len(result) == 4
filters = {
"$and": {
"year": {"$lte": "2021", "$gte": "2020"},
"$or": {"name": {"$in": ["name_0", "name_1"]}, "numbers": {"$lt": 5.0}},
}
}
result = ds.get_all_documents(filters=filters)
assert len(result) == 6
filters_simplified = {
"year": {"$lte": "2021", "$gte": "2020"},
"$or": {"name": {"$in": ["name_0", "name_2"]}, "numbers": {"$lt": 5.0}},
}
result = ds.get_all_documents(filters=filters_simplified)
assert len(result) == 6
filters = {
"$and": {
"year": {"$lte": "2021", "$gte": "2020"},
"$or": {
"name": {"$in": ["name_0", "name_1"]},
"$and": {"numbers": {"$lt": 5.0}, "$not": {"month": {"$eq": "01"}}},
},
}
}
result = ds.get_all_documents(filters=filters)
assert len(result) == 5
filters_simplified = {
"year": {"$lte": "2021", "$gte": "2020"},
"$or": {"name": ["name_0", "name_1"], "$and": {"numbers": {"$lt": 5.0}, "$not": {"month": {"$eq": "01"}}}},
}
result = ds.get_all_documents(filters=filters_simplified)
assert len(result) == 5
# Test nested logical operations within "$not", important because we apply De Morgan's laws in WeaviateDocumentStore
filters = {
"$not": {
"$or": {
"$and": {"numbers": {"$lt": 5.0}, "month": {"$ne": "01"}},
"$not": {"year": {"$lte": "2021", "$gte": "2020"}},
}
}
}
result = ds.get_all_documents(filters=filters)
docs_meta = result[0].meta["numbers"]
assert len(result) == 3
assert [2, 4] == docs_meta
# Test same logical operator twice on same level
filters = {
"$or": [
{"$and": {"name": {"$in": ["name_0", "name_1"]}, "year": {"$gte": "2020"}}},
{"$and": {"name": {"$in": ["name_0", "name_1"]}, "year": {"$lt": "2021"}}},
]
}
result = ds.get_all_documents(filters=filters)
docs_meta = [doc.meta["name"] for doc in result]
assert len(result) == 4
assert "name_0" in docs_meta
assert "name_2" not in docs_meta
@pytest.mark.integration
def test_get_document_by_id(self, ds, documents):
ds.write_documents(documents)
doc = ds.get_document_by_id(documents[0].id)
assert doc.id == documents[0].id
assert doc.content == documents[0].content
@pytest.mark.integration
def test_get_documents_by_id(self, ds, documents):
ds.write_documents(documents)
ids = [doc.id for doc in documents]
result = {doc.id for doc in ds.get_documents_by_id(ids, batch_size=2)}
assert set(ids) == result
@pytest.mark.integration
def test_get_document_count(self, ds, documents):
ds.write_documents(documents)
assert ds.get_document_count() == 9
assert ds.get_document_count(filters={"year": ["2020"]}) == 3
assert ds.get_document_count(filters={"month": ["02"]}) == 3
@pytest.mark.integration
def test_get_all_documents_generator(self, ds, documents):
ds.write_documents(documents)
assert len(list(ds.get_all_documents_generator(batch_size=2))) == 9
@pytest.mark.integration
def test_duplicate_documents_skip(self, ds, documents):
ds.write_documents(documents)
updated_docs = []
for d in documents:
updated_d = Document.from_dict(d.to_dict())
updated_d.meta["name"] = "Updated"
updated_docs.append(updated_d)
ds.write_documents(updated_docs, duplicate_documents="skip")
result = ds.get_all_documents()
assert result[0].meta["name"] == "name_0"
@pytest.mark.integration
def test_duplicate_documents_overwrite(self, ds, documents):
ds.write_documents(documents)
updated_docs = []
for d in documents:
updated_d = Document.from_dict(d.to_dict())
updated_d.meta["name"] = "Updated"
updated_docs.append(updated_d)
ds.write_documents(updated_docs, duplicate_documents="overwrite")
for doc in ds.get_all_documents():
assert doc.meta["name"] == "Updated"
@pytest.mark.integration
def test_duplicate_documents_fail(self, ds, documents):
ds.write_documents(documents)
updated_docs = []
for d in documents:
updated_d = Document.from_dict(d.to_dict())
updated_d.meta["name"] = "Updated"
updated_docs.append(updated_d)
with pytest.raises(DuplicateDocumentError):
ds.write_documents(updated_docs, duplicate_documents="fail")
@pytest.mark.integration
def test_write_document_meta(self, ds):
ds.write_documents(
[
{"content": "dict_without_meta", "id": "1"},
{"content": "dict_with_meta", "meta_field": "test2", "id": "2"},
Document(content="document_object_without_meta", id="3"),
Document(content="document_object_with_meta", meta={"meta_field": "test4"}, id="4"),
]
)
assert not ds.get_document_by_id("1").meta
assert ds.get_document_by_id("2").meta["meta_field"] == "test2"
assert not ds.get_document_by_id("3").meta
assert ds.get_document_by_id("4").meta["meta_field"] == "test4"
@pytest.mark.integration
def test_delete_documents(self, ds, documents):
ds.write_documents(documents)
ds.delete_documents()
assert ds.get_document_count() == 0
@pytest.mark.integration
def test_delete_documents_with_filters(self, ds, documents):
ds.write_documents(documents)
ds.delete_documents(filters={"year": ["2020", "2021"]})
documents = ds.get_all_documents()
assert ds.get_document_count() == 3
@pytest.mark.integration
def test_delete_documents_by_id(self, ds, documents):
ds.write_documents(documents)
docs_to_delete = ds.get_all_documents(filters={"year": ["2020"]})
ds.delete_documents(ids=[doc.id for doc in docs_to_delete])
assert ds.get_document_count() == 6
@pytest.mark.integration
def test_write_get_all_labels(self, ds, labels):
ds.write_labels(labels)
ds.write_labels(labels[:3], index="custom_index")
assert len(ds.get_all_labels()) == 9
assert len(ds.get_all_labels(index="custom_index")) == 3
# remove the index we created in this test
ds.delete_index("custom_index")
@pytest.mark.integration
def test_delete_labels(self, ds, labels):
ds.write_labels(labels)
ds.write_labels(labels[:3], index="custom_index")
ds.delete_labels()
ds.delete_labels(index="custom_index")
assert len(ds.get_all_labels()) == 0
assert len(ds.get_all_labels(index="custom_index")) == 0
# remove the index we created in this test
ds.delete_index("custom_index")
@pytest.mark.integration
def test_write_labels_duplicate(self, ds, labels):
# create a duplicate
dupe = Label.from_dict(labels[0].to_dict())
ds.write_labels(labels + [dupe])
# ensure the duplicate was discarded
assert len(ds.get_all_labels()) == len(labels)
@pytest.mark.integration
def test_delete_labels_by_id(self, ds, labels):
ds.write_labels(labels)
ds.delete_labels(ids=[labels[0].id])
assert len(ds.get_all_labels()) == len(labels) - 1
@pytest.mark.integration
def test_delete_labels_by_filter(self, ds, labels):
ds.write_labels(labels)
ds.delete_labels(filters={"query": "query_1"})
assert len(ds.get_all_labels()) == len(labels) - 1
@pytest.mark.integration
def test_delete_labels_by_filter_id(self, ds, labels):
ds.write_labels(labels)
# ids and filters are ANDed, the following should have no effect
ds.delete_labels(ids=[labels[0].id], filters={"query": "query_9"})
assert len(ds.get_all_labels()) == len(labels)
#
ds.delete_labels(ids=[labels[0].id], filters={"query": "query_0"})
assert len(ds.get_all_labels()) == len(labels) - 1
@pytest.mark.integration
def test_get_label_count(self, ds, labels):
ds.write_labels(labels)
assert ds.get_label_count() == len(labels)
@pytest.mark.integration
def test_delete_index(self, ds, documents):
ds.write_documents(documents, index="custom_index")
assert ds.get_document_count(index="custom_index") == len(documents)
ds.delete_index(index="custom_index")
with pytest.raises(Exception):
ds.get_document_count(index="custom_index")
@pytest.mark.integration
def test_update_meta(self, ds, documents):
ds.write_documents(documents)
doc = documents[0]
ds.update_document_meta(doc.id, meta={"year": "2099", "month": "12"})
doc = ds.get_document_by_id(doc.id)
assert doc.meta["year"] == "2099"
assert doc.meta["month"] == "12"
#
# Unit tests
#
@pytest.mark.unit
def test_normalize_embeddings_diff_shapes(self):
VEC_1 = np.array([0.1, 0.2, 0.3], dtype="float32")
BaseDocumentStore.normalize_embedding(VEC_1)
assert np.abs(np.linalg.norm(VEC_1) - 1) < 0.01
VEC_1 = np.array([0.1, 0.2, 0.3], dtype="float32").reshape(1, -1)
BaseDocumentStore.normalize_embedding(VEC_1)
assert np.abs(np.linalg.norm(VEC_1) - 1) < 0.01

View File

@@ -77,84 +77,6 @@ DOCUMENTS = [
]
@pytest.mark.elasticsearch
def test_init_elastic_client():
# defaults
_ = ElasticsearchDocumentStore()
# list of hosts + single port
_ = ElasticsearchDocumentStore(host=["localhost", "127.0.0.1"], port=9200)
# list of hosts + list of ports (wrong)
with pytest.raises(Exception):
_ = ElasticsearchDocumentStore(host=["localhost", "127.0.0.1"], port=[9200])
# list of hosts + list
_ = ElasticsearchDocumentStore(host=["localhost", "127.0.0.1"], port=[9200, 9200])
# only api_key
with pytest.raises(Exception):
_ = ElasticsearchDocumentStore(host=["localhost"], port=[9200], api_key="test")
# api_key + id
_ = ElasticsearchDocumentStore(host=["localhost"], port=[9200], api_key="test", api_key_id="test")
@pytest.mark.elasticsearch
def test_init_elastic_doc_store_with_index_recreation():
index_name = "test_index_recreation"
label_index_name = "test_index_recreation_labels"
document_store = ElasticsearchDocumentStore(index=index_name, label_index=label_index_name)
documents = [Document(content="Doc1")]
labels = [
Label(
query="query",
document=documents[0],
is_correct_document=True,
is_correct_answer=False,
origin="user-feedback",
answer=None,
)
]
document_store.write_documents(documents, index=index_name)
document_store.write_labels(labels, index=label_index_name)
document_store = ElasticsearchDocumentStore(index=index_name, label_index=label_index_name, recreate_index=True)
docs = document_store.get_all_documents(index=index_name)
labels = document_store.get_all_labels(index=label_index_name)
assert len(docs) == 0
assert len(labels) == 0
@pytest.mark.elasticsearch
def test_elasticsearch_eq_filter():
documents = [
{"content": "some text", "id": "1", "keyword_field": ["x", "y", "z"], "number_field": [1, 2, 3, 4]},
{"content": "some text", "id": "2", "keyword_field": ["x", "y", "w"], "number_field": [1, 2, 3]},
{"content": "some text", "id": "3", "keyword_field": ["x", "z"], "number_field": [2, 4]},
{"content": "some text", "id": "4", "keyword_field": ["z", "x"], "number_field": [5, 6]},
{"content": "some text", "id": "5", "keyword_field": ["x", "y"], "number_field": [2, 3]},
]
index = "test_elasticsearch_eq_filter"
document_store = ElasticsearchDocumentStore(index=index, recreate_index=True)
document_store.write_documents(documents)
filter = {"keyword_field": {"$eq": ["z", "x"]}}
filtered_docs = document_store.get_all_documents(index=index, filters=filter)
assert len(filtered_docs) == 2
for doc in filtered_docs:
assert set(doc.meta["keyword_field"]) == {"x", "z"}
filter = {"number_field": {"$eq": [2, 3]}}
filtered_docs = document_store.query(query=None, index=index, filters=filter)
assert len(filtered_docs) == 1
assert filtered_docs[0].meta["number_field"] == [2, 3]
assert filtered_docs[0].id == "5"
def test_write_with_duplicate_doc_ids(document_store: BaseDocumentStore):
duplicate_documents = [
Document(content="Doc1", id_hash_keys=["content"]),
@@ -1274,164 +1196,6 @@ def test_get_meta_values_by_key(document_store: BaseDocumentStore):
assert bucket["count"] == 1
@pytest.mark.elasticsearch
def test_elasticsearch_custom_fields():
document_store = ElasticsearchDocumentStore(
index="haystack_test_custom",
content_field="custom_text_field",
embedding_field="custom_embedding_field",
recreate_index=True,
)
doc_to_write = {"custom_text_field": "test", "custom_embedding_field": np.random.rand(768).astype(np.float32)}
document_store.write_documents([doc_to_write])
documents = document_store.get_all_documents(return_embedding=True)
assert len(documents) == 1
assert documents[0].content == "test"
np.testing.assert_array_equal(doc_to_write["custom_embedding_field"], documents[0].embedding)
@pytest.mark.elasticsearch
def test_elasticsearch_delete_index():
client = Elasticsearch()
index_name = "haystack_test_deletion"
document_store = ElasticsearchDocumentStore(index=index_name)
# the index should exist
index_exists = client.indices.exists(index=index_name)
assert index_exists
document_store.delete_index(index_name)
# the index was deleted and should not exist
index_exists = client.indices.exists(index=index_name)
assert not index_exists
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
def test_elasticsearch_query_with_filters_and_missing_embeddings(document_store: ElasticsearchDocumentStore):
document_store.write_documents(DOCUMENTS)
document_without_embedding = Document(
content="Doc without embedding", meta={"name": "name_7", "year": "2021", "month": "04"}
)
document_store.write_documents([document_without_embedding])
filters = {"year": "2021"}
document_store.skip_missing_embeddings = False
with pytest.raises(RequestError):
document_store.query_by_embedding(np.random.rand(768), filters=filters)
document_store.skip_missing_embeddings = True
documents = document_store.query_by_embedding(np.random.rand(768), filters=filters)
assert len(documents) == 3
@pytest.mark.elasticsearch
def test_get_document_count_only_documents_without_embedding_arg():
documents = [
{
"content": "text1",
"id": "1",
"embedding": np.random.rand(768).astype(np.float32),
"meta_field_for_count": "a",
},
{
"content": "text2",
"id": "2",
"embedding": np.random.rand(768).astype(np.float64),
"meta_field_for_count": "b",
},
{"content": "text3", "id": "3", "embedding": np.random.rand(768).astype(np.float32).tolist()},
{"content": "text4", "id": "4", "meta_field_for_count": "b"},
{"content": "text5", "id": "5", "meta_field_for_count": "b"},
{"content": "text6", "id": "6", "meta_field_for_count": "c"},
{
"content": "text7",
"id": "7",
"embedding": np.random.rand(768).astype(np.float64),
"meta_field_for_count": "c",
},
]
_index: str = "haystack_test_count"
document_store = ElasticsearchDocumentStore(index=_index, recreate_index=True)
document_store.write_documents(documents)
assert document_store.get_document_count() == 7
assert document_store.get_document_count(only_documents_without_embedding=True) == 3
assert (
document_store.get_document_count(
only_documents_without_embedding=True, filters={"meta_field_for_count": ["c"]}
)
== 1
)
assert (
document_store.get_document_count(
only_documents_without_embedding=True, filters={"meta_field_for_count": ["b"]}
)
== 2
)
@pytest.mark.elasticsearch
def test_skip_missing_embeddings(caplog):
documents = [
{"content": "text1", "id": "1"}, # a document without embeddings
{"content": "text2", "id": "2", "embedding": np.random.rand(768).astype(np.float64)},
{"content": "text3", "id": "3", "embedding": np.random.rand(768).astype(np.float32).tolist()},
{"content": "text4", "id": "4", "embedding": np.random.rand(768).astype(np.float32)},
]
document_store = ElasticsearchDocumentStore(index="skip_missing_embedding_index", recreate_index=True)
document_store.write_documents(documents)
document_store.skip_missing_embeddings = True
retrieved_docs = document_store.query_by_embedding(np.random.rand(768).astype(np.float32))
assert len(retrieved_docs) == 3
document_store.skip_missing_embeddings = False
with pytest.raises(RequestError):
document_store.query_by_embedding(np.random.rand(768).astype(np.float32))
# Test scenario with no embeddings for the entire index
documents = [
{"content": "text1", "id": "1"},
{"content": "text2", "id": "2"},
{"content": "text3", "id": "3"},
{"content": "text4", "id": "4"},
]
document_store.delete_documents()
document_store.write_documents(documents)
document_store.skip_missing_embeddings = True
with caplog.at_level(logging.WARNING):
document_store.query_by_embedding(np.random.rand(768).astype(np.float32))
assert "No documents with embeddings. Run the document store's update_embeddings() method." in caplog.text
@pytest.mark.elasticsearch
def test_elasticsearch_synonyms():
synonyms = ["i-pod, i pod, ipod", "sea biscuit, sea biscit, seabiscuit", "foo, foo bar, baz"]
synonym_type = "synonym_graph"
client = Elasticsearch()
client.indices.delete(index="haystack_synonym_arg", ignore=[404])
document_store = ElasticsearchDocumentStore(
index="haystack_synonym_arg", synonyms=synonyms, synonym_type=synonym_type
)
indexed_settings = client.indices.get_settings(index="haystack_synonym_arg")
assert (
synonym_type
== indexed_settings["haystack_synonym_arg"]["settings"]["index"]["analysis"]["filter"]["synonym"]["type"]
)
assert (
synonyms
== indexed_settings["haystack_synonym_arg"]["settings"]["index"]["analysis"]["filter"]["synonym"]["synonyms"]
)
@pytest.mark.parametrize(
"document_store_with_docs", ["memory", "faiss", "milvus1", "weaviate", "elasticsearch"], indirect=True
)
@@ -1980,105 +1744,6 @@ def test_DeepsetCloudDocumentStore_query_without_index():
assert document_store.query(query="some query") == []
@pytest.mark.elasticsearch
def test_elasticsearch_search_field_mapping():
client = Elasticsearch()
client.indices.delete(index="haystack_search_field_mapping", ignore=[404])
index_data = [
{
"title": "Green tea components",
"meta": {
"content": "The green tea plant contains a range of healthy compounds that make it into the final drink",
"sub_content": "Drink tip",
},
"id": "1",
},
{
"title": "Green tea catechin",
"meta": {
"content": "Green tea contains a catechin called epigallocatechin-3-gallate (EGCG).",
"sub_content": "Ingredients tip",
},
"id": "2",
},
{
"title": "Minerals in Green tea",
"meta": {
"content": "Green tea also has small amounts of minerals that can benefit your health.",
"sub_content": "Minerals tip",
},
"id": "3",
},
{
"title": "Green tea Benefits",
"meta": {
"content": "Green tea does more than just keep you alert, it may also help boost brain function.",
"sub_content": "Health tip",
},
"id": "4",
},
]
document_store = ElasticsearchDocumentStore(
index="haystack_search_field_mapping", search_fields=["content", "sub_content"], content_field="title"
)
document_store.write_documents(index_data)
indexed_settings = client.indices.get_mapping(index="haystack_search_field_mapping")
assert indexed_settings["haystack_search_field_mapping"]["mappings"]["properties"]["content"]["type"] == "text"
assert indexed_settings["haystack_search_field_mapping"]["mappings"]["properties"]["sub_content"]["type"] == "text"
@pytest.mark.elasticsearch
def test_elasticsearch_existing_alias():
client = Elasticsearch()
client.indices.delete(index="haystack_existing_alias_1", ignore=[404])
client.indices.delete(index="haystack_existing_alias_2", ignore=[404])
client.indices.delete_alias(index="_all", name="haystack_existing_alias", ignore=[404])
settings = {"mappings": {"properties": {"content": {"type": "text"}}}}
client.indices.create(index="haystack_existing_alias_1", body=settings)
client.indices.create(index="haystack_existing_alias_2", body=settings)
client.indices.put_alias(
index="haystack_existing_alias_1,haystack_existing_alias_2", name="haystack_existing_alias"
)
# To be valid, all indices related to the alias must have content field of type text
_ = ElasticsearchDocumentStore(index="haystack_existing_alias", search_fields=["content"])
@pytest.mark.elasticsearch
def test_elasticsearch_existing_alias_missing_fields():
client = Elasticsearch()
client.indices.delete(index="haystack_existing_alias_1", ignore=[404])
client.indices.delete(index="haystack_existing_alias_2", ignore=[404])
client.indices.delete_alias(index="_all", name="haystack_existing_alias", ignore=[404])
right_settings = {"mappings": {"properties": {"content": {"type": "text"}}}}
wrong_settings = {"mappings": {"properties": {"content": {"type": "histogram"}}}}
client.indices.create(index="haystack_existing_alias_1", body=right_settings)
client.indices.create(index="haystack_existing_alias_2", body=wrong_settings)
client.indices.put_alias(
index="haystack_existing_alias_1,haystack_existing_alias_2", name="haystack_existing_alias"
)
with pytest.raises(Exception):
# wrong field type for "content" in index "haystack_existing_alias_2"
_ = ElasticsearchDocumentStore(
index="haystack_existing_alias", search_fields=["content"], content_field="title"
)
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
def test_elasticsearch_brownfield_support(document_store_with_docs):
new_document_store = InMemoryDocumentStore()
@@ -2122,9 +1787,7 @@ def test_elasticsearch_brownfield_support(document_store_with_docs):
@pytest.mark.parametrize(
"document_store",
["faiss", "milvus1", "milvus", "weaviate", "opensearch_faiss", "opensearch", "elasticsearch", "memory"],
indirect=True,
"document_store", ["faiss", "milvus1", "milvus", "weaviate", "opensearch", "elasticsearch", "memory"], indirect=True
)
def test_cosine_similarity(document_store: BaseDocumentStore):
# below we will write documents to the store and then query it to see if vectors were normalized or not
@@ -2166,9 +1829,7 @@ def test_cosine_similarity(document_store: BaseDocumentStore):
@pytest.mark.parametrize(
"document_store",
["faiss", "milvus1", "milvus", "weaviate", "opensearch_faiss", "opensearch", "elasticsearch", "memory"],
indirect=True,
"document_store", ["faiss", "milvus1", "milvus", "weaviate", "opensearch", "elasticsearch", "memory"], indirect=True
)
def test_update_embeddings_cosine_similarity(document_store: BaseDocumentStore):
# below we will write documents to the store and then query it to see if vectors were normalized
@@ -2228,7 +1889,7 @@ def test_update_embeddings_cosine_similarity(document_store: BaseDocumentStore):
@pytest.mark.parametrize(
"document_store_small",
["faiss", "milvus1", "milvus", "weaviate", "memory", "elasticsearch", "opensearch", "opensearch_faiss"],
["faiss", "milvus1", "milvus", "weaviate", "memory", "elasticsearch", "opensearch"],
indirect=True,
)
def test_cosine_sanity_check(document_store_small):

View File

@@ -0,0 +1,225 @@
import os
import pytest
import numpy as np
from haystack.schema import Document
from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
from .test_base import DocumentStoreBaseTestAbstract
from .test_search_engine import SearchEngineDocumentStoreTestAbstract
class TestElasticsearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDocumentStoreTestAbstract):
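# By inheriting both abstract suites, every generic test from test_base.py and
# test_search_engine.py also runs against Elasticsearch alongside the tests below.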
# Constants
index_name = __name__
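# Using the module name as the index name gives each test module its own isolated index.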
@pytest.fixture
def ds(self):
"""
This fixture provides a working document store and takes care of removing the indices when done
"""
labels_index_name = f"{self.index_name}_labels"
ds = ElasticsearchDocumentStore(
index=self.index_name,
label_index=labels_index_name,
host=os.environ.get("ELASTICSEARCH_HOST", "localhost"),
create_index=True,
)
yield ds
ds.delete_index(self.index_name)
ds.delete_index(labels_index_name)
@pytest.mark.integration
def test___init__(self):
# defaults
_ = ElasticsearchDocumentStore()
# list of hosts + single port
_ = ElasticsearchDocumentStore(host=["localhost", "127.0.0.1"], port=9200)
# list of hosts + list of ports (wrong)
with pytest.raises(Exception):
_ = ElasticsearchDocumentStore(host=["localhost", "127.0.0.1"], port=[9200])
# list of hosts + list
_ = ElasticsearchDocumentStore(host=["localhost", "127.0.0.1"], port=[9200, 9200])
# only api_key
with pytest.raises(Exception):
_ = ElasticsearchDocumentStore(host=["localhost"], port=[9200], api_key="test")
# api_key + id
_ = ElasticsearchDocumentStore(host=["localhost"], port=[9200], api_key="test", api_key_id="test")
@pytest.mark.integration
def test_recreate_index(self, ds, documents, labels):
ds.write_documents(documents)
ds.write_labels(labels)
# Create another document store on top of the previous one
ds = ElasticsearchDocumentStore(index=ds.index, label_index=ds.label_index, recreate_index=True)
assert len(ds.get_all_documents(index=ds.index)) == 0
assert len(ds.get_all_labels(index=ds.label_index)) == 0
@pytest.mark.integration
def test_eq_filter(self, ds, documents):
ds.write_documents(documents)
filter = {"name": {"$eq": ["name_0"]}}
filtered_docs = ds.get_all_documents(filters=filter)
assert len(filtered_docs) == 3
for doc in filtered_docs:
assert doc.meta["name"] == "name_0"
filter = {"numbers": {"$eq": [2, 4]}}
filtered_docs = ds.query(query=None, filters=filter)
assert len(filtered_docs) == 3
for doc in filtered_docs:
assert doc.meta["month"] == "01"
assert doc.meta["numbers"] == [2, 4]
@pytest.mark.integration
def test_custom_fields(self, ds):
index = "haystack_test_custom"
document_store = ElasticsearchDocumentStore(
index=index,
content_field="custom_text_field",
embedding_field="custom_embedding_field",
recreate_index=True,
)
doc_to_write = {"custom_text_field": "test", "custom_embedding_field": np.random.rand(768).astype(np.float32)}
document_store.write_documents([doc_to_write])
documents = document_store.get_all_documents(return_embedding=True)
assert len(documents) == 1
assert documents[0].content == "test"
np.testing.assert_array_equal(doc_to_write["custom_embedding_field"], documents[0].embedding)
document_store.delete_index(index)
@pytest.mark.integration
def test_query_with_filters_and_missing_embeddings(self, ds, documents):
ds.write_documents(documents)
filters = {"month": {"$in": ["01", "03"]}}
ds.skip_missing_embeddings = False
with pytest.raises(ds._RequestError):
ds.query_by_embedding(np.random.rand(768), filters=filters)
ds.skip_missing_embeddings = True
documents = ds.query_by_embedding(np.random.rand(768), filters=filters)
assert len(documents) == 3
@pytest.mark.integration
def test_synonyms(self, ds):
synonyms = ["i-pod, i pod, ipod", "sea biscuit, sea biscit, seabiscuit", "foo, foo bar, baz"]
synonym_type = "synonym_graph"
client = ds.client
index = "haystack_synonym_arg"
client.indices.delete(index=index, ignore=[404])
ElasticsearchDocumentStore(index=index, synonyms=synonyms, synonym_type=synonym_type)
indexed_settings = client.indices.get_settings(index=index)
assert synonym_type == indexed_settings[index]["settings"]["index"]["analysis"]["filter"]["synonym"]["type"]
assert synonyms == indexed_settings[index]["settings"]["index"]["analysis"]["filter"]["synonym"]["synonyms"]
@pytest.mark.integration
def test_search_field_mapping(self):
index = "haystack_search_field_mapping"
document_store = ElasticsearchDocumentStore(
index=index, search_fields=["content", "sub_content"], content_field="title"
)
document_store.write_documents(
[
{
"title": "Green tea components",
"meta": {
"content": "The green tea plant contains a range of healthy compounds that make it into the final drink",
"sub_content": "Drink tip",
},
"id": "1",
},
{
"title": "Green tea catechin",
"meta": {
"content": "Green tea contains a catechin called epigallocatechin-3-gallate (EGCG).",
"sub_content": "Ingredients tip",
},
"id": "2",
},
{
"title": "Minerals in Green tea",
"meta": {
"content": "Green tea also has small amounts of minerals that can benefit your health.",
"sub_content": "Minerals tip",
},
"id": "3",
},
{
"title": "Green tea Benefits",
"meta": {
"content": "Green tea does more than just keep you alert, it may also help boost brain function.",
"sub_content": "Health tip",
},
"id": "4",
},
]
)
indexed_settings = document_store.client.indices.get_mapping(index=index)
assert indexed_settings[index]["mappings"]["properties"]["content"]["type"] == "text"
assert indexed_settings[index]["mappings"]["properties"]["sub_content"]["type"] == "text"
document_store.delete_index(index)
@pytest.mark.integration
def test_existing_alias(self, ds):
client = ds.client
client.indices.delete(index="haystack_existing_alias_1", ignore=[404])
client.indices.delete(index="haystack_existing_alias_2", ignore=[404])
client.indices.delete_alias(index="_all", name="haystack_existing_alias", ignore=[404])
settings = {"mappings": {"properties": {"content": {"type": "text"}}}}
client.indices.create(index="haystack_existing_alias_1", body=settings)
client.indices.create(index="haystack_existing_alias_2", body=settings)
client.indices.put_alias(
index="haystack_existing_alias_1,haystack_existing_alias_2", name="haystack_existing_alias"
)
# To be valid, all indices related to the alias must have content field of type text
ElasticsearchDocumentStore(index="haystack_existing_alias", search_fields=["content"])
@pytest.mark.integration
def test_existing_alias_missing_fields(self, ds):
client = ds.client
client.indices.delete(index="haystack_existing_alias_1", ignore=[404])
client.indices.delete(index="haystack_existing_alias_2", ignore=[404])
client.indices.delete_alias(index="_all", name="haystack_existing_alias", ignore=[404])
right_settings = {"mappings": {"properties": {"content": {"type": "text"}}}}
wrong_settings = {"mappings": {"properties": {"content": {"type": "histogram"}}}}
client.indices.create(index="haystack_existing_alias_1", body=right_settings)
client.indices.create(index="haystack_existing_alias_2", body=wrong_settings)
client.indices.put_alias(
index="haystack_existing_alias_1,haystack_existing_alias_2", name="haystack_existing_alias"
)
with pytest.raises(Exception):
# wrong field type for "content" in index "haystack_existing_alias_2"
ElasticsearchDocumentStore(
index="haystack_existing_alias", search_fields=["content"], content_field="title"
)
@pytest.mark.integration
def test_get_document_count_only_documents_without_embedding_arg(self, ds, documents):
ds.write_documents(documents)
assert ds.get_document_count() == 9
assert ds.get_document_count(only_documents_without_embedding=True) == 3
assert ds.get_document_count(only_documents_without_embedding=True, filters={"month": ["01"]}) == 0
assert ds.get_document_count(only_documents_without_embedding=True, filters={"month": ["03"]}) == 3

View File

@@ -1,3 +1,4 @@
import os
import logging
from unittest.mock import MagicMock, patch
@@ -19,15 +20,16 @@ from haystack.document_stores.opensearch import (
from haystack.schema import Document, Label, Answer
from haystack.errors import DocumentStoreError
# Being all the tests in this module, ideally we wouldn't need a marker here,
# but this is to allow this test suite to be skipped when running (e.g.)
# `pytest test/document_stores --document-store-type=faiss`
class TestOpenSearchDocumentStore:
from .test_base import DocumentStoreBaseTestAbstract
from .test_search_engine import SearchEngineDocumentStoreTestAbstract
class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDocumentStoreTestAbstract):
# Constants
query_emb = np.random.random_sample(size=(2, 2))
index_name = "myindex"
index_name = __name__
# Fixtures
@@ -36,11 +38,15 @@ class TestOpenSearchDocumentStore:
"""
This fixture provides a working document store and takes care of removing the indices when done
"""
index_name = __name__
labels_index_name = f"{index_name}_labels"
ds = OpenSearchDocumentStore(index=index_name, label_index=labels_index_name, port=9201, create_index=True)
labels_index_name = f"{self.index_name}_labels"
ds = OpenSearchDocumentStore(
index=self.index_name,
label_index=labels_index_name,
host=os.environ.get("OPENSEARCH_HOST", "localhost"),
create_index=True,
)
yield ds
ds.delete_index(index_name)
ds.delete_index(self.index_name)
ds.delete_index(labels_index_name)
@pytest.fixture
@@ -82,35 +88,6 @@ class TestOpenSearchDocumentStore:
"use_system_proxy": True,
}
@pytest.fixture
def documents(self):
documents = []
for i in range(3):
documents.append(
Document(
content=f"A Foo Document {i}",
meta={"name": f"name_{i}", "year": "2020", "month": "01"},
embedding=np.random.rand(768).astype(np.float32),
)
)
documents.append(
Document(
content=f"A Bar Document {i}",
meta={"name": f"name_{i}", "year": "2021", "month": "02"},
embedding=np.random.rand(768).astype(np.float32),
)
)
documents.append(
Document(
content=f"Document {i} without embeddings",
meta={"name": f"name_{i}", "no_embedding": True, "month": "03"},
)
)
return documents
@pytest.fixture
def index(self):
return {
@@ -143,46 +120,15 @@ class TestOpenSearchDocumentStore:
},
}
@pytest.fixture
def labels(self, documents):
labels = []
for i, d in enumerate(documents):
labels.append(
Label(
query="query",
document=d,
is_correct_document=True,
is_correct_answer=False,
# create a mix set of labels
origin="user-feedback" if i % 2 else "gold-label",
answer=None if not i else Answer(f"the answer is {i}"),
)
)
return labels
# Integration tests
@pytest.mark.integration
def test___init__(self):
OpenSearchDocumentStore(index="default_index", port=9201, create_index=True)
OpenSearchDocumentStore(index="default_index", create_index=True)
@pytest.mark.integration
def test___init___faiss(self):
OpenSearchDocumentStore(index="faiss_index", port=9201, create_index=True, knn_engine="faiss")
@pytest.mark.integration
def test_write_documents(self, ds, documents):
ds.write_documents(documents)
docs = ds.get_all_documents()
assert len(docs) == len(documents)
for i, doc in enumerate(docs):
expected = documents[i]
assert doc.id == expected.id
@pytest.mark.integration
def test_write_labels(self, ds, labels):
ds.write_labels(labels)
assert ds.get_all_labels() == labels
OpenSearchDocumentStore(index="faiss_index", create_index=True, knn_engine="faiss")
@pytest.mark.integration
def test_recreate_index(self, ds, documents, labels):
@@ -190,7 +136,7 @@ class TestOpenSearchDocumentStore:
ds.write_labels(labels)
# Create another document store on top of the previous one
ds = OpenSearchDocumentStore(index=ds.index, label_index=ds.label_index, recreate_index=True, port=9201)
ds = OpenSearchDocumentStore(index=ds.index, label_index=ds.label_index, recreate_index=True)
assert len(ds.get_all_documents(index=ds.index)) == 0
assert len(ds.get_all_labels(index=ds.label_index)) == 0
@@ -213,7 +159,7 @@ class TestOpenSearchDocumentStore:
assert ds.embeddings_field_supports_similarity == True
index_name = ds.index
with caplog.at_level(logging.WARNING):
ds = OpenSearchDocumentStore(port=9201, knn_engine="faiss", index=index_name)
ds = OpenSearchDocumentStore(knn_engine="faiss", index=index_name)
warning = (
"Embedding field 'embedding' was initially created with knn_engine 'nmslib', but knn_engine was "
"set to 'faiss' when initializing OpenSearchDocumentStore. Falling back to slow exact vector "

View File

@@ -0,0 +1,58 @@
import pytest
from haystack.document_stores.search_engine import SearchEngineDocumentStore, prepare_hosts
@pytest.mark.unit
def test_prepare_hosts():
pass
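# NOTE: this and several methods below are placeholders (`pass`), presumably to be
# filled in as more search-engine specific tests are ported to the new hierarchy.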
@pytest.mark.document_store
class SearchEngineDocumentStoreTestAbstract:
"""
This is the base class for any search-engine Document Store test suite. It doesn't have the `Test` prefix
in the name because we want its methods to run only in subclasses.
"""
@pytest.mark.integration
def test___do_bulk(self):
pass
@pytest.mark.integration
def test___do_scan(self):
pass
@pytest.mark.integration
def test_query_by_embedding(self):
pass
@pytest.mark.integration
def test_get_meta_values_by_key(self, ds, documents):
ds.write_documents(documents)
# test without filters or query
result = ds.get_metadata_values_by_key(key="name")
assert result == [
{"count": 3, "value": "name_0"},
{"count": 3, "value": "name_1"},
{"count": 3, "value": "name_2"},
]
# test with filters but no query
result = ds.get_metadata_values_by_key(key="year", filters={"month": ["01"]})
assert result == [{"count": 3, "value": "2020"}]
# test with filters & query
result = ds.get_metadata_values_by_key(key="year", query="Bar")
assert result == [{"count": 3, "value": "2021"}]
@pytest.mark.document_store
class TestSearchEngineDocumentStore:
"""
This class tests the concrete methods in SearchEngineDocumentStore
"""
@pytest.mark.integration
def test__split_document_list(self):
pass