diff --git a/.github/workflows/ci.yml b/.github/workflows/linux_ci.yml
similarity index 90%
rename from .github/workflows/ci.yml
rename to .github/workflows/linux_ci.yml
index 7f2cba977..4d78e9051 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/linux_ci.yml
@@ -1,4 +1,4 @@
-name: Build
+name: Linux CI
 
 on:
   push:
@@ -33,7 +33,7 @@ jobs:
         uses: actions/cache@v2
         with:
           path: ${{ env.pythonLocation }}
-          key: ${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
+          key: linux-${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
       - name: Install dependencies
         if: steps.cache-python-env.outputs.cache-hit != 'true'
         run: |
@@ -70,7 +70,7 @@ jobs:
         uses: actions/cache@v2
         with:
           path: ${{ env.pythonLocation }}
-          key: ${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
+          key: linux-${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
 
       - name: Run Elasticsearch
         run: docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx128m" elasticsearch:7.9.2
diff --git a/.github/workflows/windows_ci.yml b/.github/workflows/windows_ci.yml
new file mode 100644
index 000000000..aec550632
--- /dev/null
+++ b/.github/workflows/windows_ci.yml
@@ -0,0 +1,97 @@
+name: Windows CI
+
+on:
+  push:
+    branches: [ master ]
+#  pull_request:
+#    branches: [ master ]
+
+jobs:
+  type-check:
+    runs-on: windows-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: 3.8
+      - name: Test with mypy
+        run: |
+          pip install mypy types-Markdown types-requests types-PyYAML pydantic
+          mypy haystack
+
+  build-cache:
+    needs: type-check
+    runs-on: windows-latest
+
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: 3.7
+      - run: echo "date=$(Get-Date -Format 'yyyy-MM-dd')" >> $env:GITHUB_ENV
+      - name: Cache
+        id: cache-python-env
+        uses: actions/cache@v2
+        with:
+          path: ${{ env.pythonLocation }}
+          key: windows-${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
+      - name: Install PyTorch on Windows
+        run: |
+          pip install torch==1.8.1+cpu -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
+      - name: Install dependencies
+        if: steps.cache-python-env.outputs.cache-hit != 'true'
+        run: |
+          python -m pip install --upgrade pip
+          pip install --upgrade --upgrade-strategy eager -r requirements-dev.txt -e .
+          pip install --upgrade --upgrade-strategy eager -f https://download.pytorch.org/whl/torch_stable.html -r requirements.txt -e .
+          pip install torch-scatter -f https://data.pyg.org/whl/torch-1.9.0+cpu.html
+
+  prepare-build:
+    needs: build-cache
+    # This step only lists the test files; it errors on Windows, so run it on Linux
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v2
+      - id: set-matrix
+        run: |
+          echo "::set-output name=matrix::$(cd test && ls -d test_*.py | jq -R . | jq -cs .)"
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+
+  build:
+    needs: prepare-build
+    runs-on: windows-latest
+    strategy:
+      matrix:
+        test-path: ${{fromJson(needs.prepare-build.outputs.matrix)}}
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python 3.7
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.7
+      - run: echo "date=$(Get-Date -Format 'yyyy-MM-dd')" >> $env:GITHUB_ENV
+      - name: Cache
+        uses: actions/cache@v2
+        with:
+          path: ${{ env.pythonLocation }}
+          key: windows-${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
+
+      # The Windows runner can't run Linux containers. See https://github.com/actions/virtual-environments/issues/1143
+      - name: Set up Windows test env
+        run: |
+          choco install xpdf-utils
+          choco install openjdk11
+          refreshenv
+          choco install tesseract --pre
+          choco install elasticsearch --version=7.9.2
+          refreshenv
+          Get-Service elasticsearch-service-x64 | Start-Service
+
+      # Skip whole test files when none of their tests can run on Windows:
+      # deselecting every test in a file makes pytest error, see https://github.com/pytest-dev/pytest/issues/812
+      # Skipped here: test_ray, test_knowledge_graph and test_connector
+      - name: Run tests
+        if: ${{ !contains(fromJSON('["test_ray.py", "test_knowledge_graph.py", "test_connector.py"]'), matrix.test-path) }}
+        run: cd test && pytest --document_store_type=memory,faiss,elasticsearch -m "not tika and not graphdb" -s ${{ matrix.test-path }}
diff --git a/haystack/utils/preprocessing.py b/haystack/utils/preprocessing.py
index cce32aa2f..4f8a3305e 100644
--- a/haystack/utils/preprocessing.py
+++ b/haystack/utils/preprocessing.py
@@ -8,7 +8,6 @@ from haystack.nodes.file_converter import (
     BaseConverter,
     DocxToTextConverter,
     PDFToTextConverter,
-    TikaConverter,
     TextConverter
 )
 
@@ -99,6 +98,11 @@ def tika_convert_files_to_dicts(
     :param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
     :param split_paragraphs: split text in paragraphs.
     """
+    try:
+        from haystack.nodes.file_converter import TikaConverter
+    except Exception as ex:
+        logger.error("Tika is not installed. Please install Tika and try again. Error: {}".format(ex))
+        raise
     converter = TikaConverter()
     paths = [p for p in Path(dir_path).glob("**/*")]
     allowed_suffixes = [".pdf", ".txt"]
diff --git a/test/conftest.py b/test/conftest.py
index 53070e37c..a10705cf0 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -224,7 +224,7 @@ def tika_fixture():
 
 
 @pytest.fixture(scope="session")
-def xpdf_fixture(tika_fixture):
+def xpdf_fixture():
     verify_installation = run(["pdftotext"], shell=True)
     if verify_installation.returncode == 127:
         if platform.startswith("linux"):
diff --git a/test/test_document_store.py b/test/test_document_store.py
index e30dd6519..7d6152416 100644
--- a/test/test_document_store.py
+++ b/test/test_document_store.py
@@ -68,6 +68,7 @@ def test_write_with_duplicate_doc_ids_custom_index(document_store):
     # writing to the default, empty index should still work
     document_store.write_documents(documents, duplicate_documents="fail")
 
+
 def test_get_all_documents_without_filters(document_store_with_docs):
     documents = document_store_with_docs.get_all_documents()
     assert all(isinstance(d, Document) for d in documents)
@@ -812,7 +813,7 @@ def test_get_meta_values_by_key(document_store):
 
 
 @pytest.mark.elasticsearch
-def test_elasticsearch_custom_fields(elasticsearch_fixture):
+def test_elasticsearch_custom_fields():
     client = Elasticsearch()
     client.indices.delete(index='haystack_test_custom', ignore=[404])
     document_store = ElasticsearchDocumentStore(index="haystack_test_custom", content_field="custom_text_field",
diff --git a/test/test_faiss_and_milvus.py b/test/test_faiss_and_milvus.py
index 811c429d4..3b75deaff 100644
--- a/test/test_faiss_and_milvus.py
+++ b/test/test_faiss_and_milvus.py
@@ -1,4 +1,5 @@
-import time
+import sys
+
 import faiss
 import math
 import numpy as np
@@ -19,6 +20,7 @@ DOCUMENTS = [
 ]
 
 
+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path does not work on Windows runner")
 def test_faiss_index_save_and_load(tmp_path):
     document_store = FAISSDocumentStore(
         sql_url=f"sqlite:////{tmp_path/'haystack_test.db'}",
@@ -47,6 +49,7 @@
     assert not new_document_store.progress_bar
 
 
+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path does not work on Windows runner")
 def test_faiss_index_save_and_load_custom_path(tmp_path):
     document_store = FAISSDocumentStore(
         sql_url=f"sqlite:////{tmp_path/'haystack_test.db'}",
@@ -95,7 +98,7 @@ def test_faiss_write_docs(document_store, index_buffer_size, batch_size):
         stored_emb = document_store.faiss_indexes[document_store.index].reconstruct(int(doc.meta["vector_id"]))
         # compare original input vec with stored one (ignore extra dim added by hnsw)
         assert np.allclose(original_doc["embedding"], stored_emb, rtol=0.01)
-    
+
 
 @pytest.mark.slow
 @pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@@ -158,6 +161,7 @@
     assert len(documents_indexed) == len(DOCUMENTS)
 
 
+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path does not work on Windows runner")
 @pytest.mark.parametrize("index_factory", ["Flat", "HNSW", "IVF1,Flat"])
 def test_faiss_retrieving(index_factory, tmp_path):
     document_store = FAISSDocumentStore(
@@ -253,7 +257,7 @@ def test_delete_docs_by_id_with_filters(document_store, retriever):
     all_ids_left = [doc.id for doc in documents]
     assert all(doc_id in all_ids_left for doc_id in ids_not_to_delete)
 
-    
+
 
 @pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
 @pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
@@ -271,6 +275,7 @@
     assert len(output["documents"]) == 3
 
 
+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path does not work on Windows runner")
 def test_faiss_passing_index_from_outside(tmp_path):
     d = 768
     nlist = 2
@@ -295,6 +300,7 @@
     assert 0 <= int(doc.meta["vector_id"]) <= 7
 
 
+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path does not work on Windows runner")
 def test_faiss_cosine_similarity(tmp_path):
     document_store = FAISSDocumentStore(
         sql_url=f"sqlite:////{tmp_path/'haystack_test_faiss.db'}", similarity='cosine'
@@ -322,7 +328,7 @@
 
         # check if the stored embedding was normalized
         assert np.allclose(original_emb[0], result_emb, rtol=0.01)
-        
+
         # check if the score is plausible for cosine similarity
         assert 0 <= doc.score <= 1.0
 
@@ -342,7 +348,7 @@
 
     assert not np.allclose(original_emb[0], doc.embedding, rtol=0.01)
 
-
+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path does not work on Windows runner")
 def test_faiss_cosine_sanity_check(tmp_path):
     document_store = FAISSDocumentStore(
         sql_url=f"sqlite:////{tmp_path/'haystack_test_faiss.db'}", similarity='cosine',
diff --git a/test/test_file_converter.py b/test/test_file_converter.py
index f163c489b..79c9a7964 100644
--- a/test/test_file_converter.py
+++ b/test/test_file_converter.py
@@ -13,7 +13,7 @@ from haystack.file_converter.tika import TikaConverter
     # "Converter", [PDFToTextConverter, TikaConverter, PDFToTextOCRConverter]
     "Converter", [PDFToTextOCRConverter]
 )
-def test_convert(Converter, xpdf_fixture):
+def test_convert(Converter):
     converter = Converter()
     document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
     pages = document["content"].split("\f")
@@ -31,7 +31,7 @@
 
 @pytest.mark.tika
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
-def test_table_removal(Converter, xpdf_fixture):
+def test_table_removal(Converter):
     converter = Converter(remove_numeric_tables=True)
     document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
     pages = document["content"].split("\f")
@@ -42,7 +42,7 @@
 
 @pytest.mark.tika
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
-def test_language_validation(Converter, xpdf_fixture, caplog):
+def test_language_validation(Converter, caplog):
     converter = Converter(valid_languages=["en"])
     converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
     assert (
diff --git a/test/test_generator.py b/test/test_generator.py
index e98470a75..8b44d7681 100644
--- a/test/test_generator.py
+++ b/test/test_generator.py
@@ -1,3 +1,4 @@
+import sys
 from typing import List
 
 import numpy as np
@@ -428,6 +429,7 @@
     assert "berlin" in answers[0]["answer"]
 
 
+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Gives a memory allocation error on the Windows runner")
 @pytest.mark.slow
 @pytest.mark.generator
 @pytest.mark.parametrize("document_store", ["memory"], indirect=True)
diff --git a/test/test_knowledge_graph.py b/test/test_knowledge_graph.py
index b21dc5c73..5f3ca1007 100644
--- a/test/test_knowledge_graph.py
+++ b/test/test_knowledge_graph.py
@@ -6,8 +6,9 @@ from haystack.nodes import Text2SparqlRetriever
 from haystack.document_stores import GraphDBKnowledgeGraph
 from haystack.utils import fetch_archive_from_http
 
+
 @pytest.mark.graphdb
-def test_graph_retrieval(graphdb_fixture):
+def test_graph_retrieval():
     # TODO rename doc_dir
     graph_dir = "../data/tutorial10_knowledge_graph/"
     s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/triples_and_config.zip"
diff --git a/test/test_preprocessor.py b/test/test_preprocessor.py
index 64b6eb361..b51f9b258 100644
--- a/test/test_preprocessor.py
+++ b/test/test_preprocessor.py
@@ -18,7 +18,6 @@ in the sentence.
 """
 
 
-@pytest.mark.tika
 def test_preprocess_sentence_split():
     document = {"content": TEXT}
     preprocessor = PreProcessor(split_length=1, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False)
@@ -32,7 +31,6 @@
     assert len(documents) == 2
 
 
-@pytest.mark.tika
 def test_preprocess_word_split():
     document = {"content": TEXT}
     preprocessor = PreProcessor(split_length=10, split_overlap=0, split_by="word", split_respect_sentence_boundary=False)
@@ -56,7 +54,6 @@
     assert len(documents) == 15
 
 
-@pytest.mark.tika
 def test_preprocess_passage_split():
     document = {"content": TEXT}
     preprocessor = PreProcessor(split_length=1, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False)
@@ -68,7 +65,6 @@
     assert len(documents) == 2
 
 
-@pytest.mark.tika
 def test_clean_header_footer():
     converter = PDFToTextConverter()
     document = converter.convert(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header/footer
diff --git a/test/test_retriever.py b/test/test_retriever.py
index 7ecce05a6..da99e8a1f 100644
--- a/test/test_retriever.py
+++ b/test/test_retriever.py
@@ -94,7 +94,7 @@ def test_retrieval(retriever_with_docs, document_store_with_docs):
 
 
 @pytest.mark.elasticsearch
-def test_elasticsearch_custom_query(elasticsearch_fixture):
+def test_elasticsearch_custom_query():
     client = Elasticsearch()
     client.indices.delete(index="haystack_test_custom", ignore=[404])
     document_store = ElasticsearchDocumentStore(
diff --git a/test/test_utils.py b/test/test_utils.py
index f4ad36860..7320437a8 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1,17 +1,16 @@
 import pytest
 
-from haystack.preprocessor.utils import convert_files_to_dicts, tika_convert_files_to_dicts
-from haystack.preprocessor.cleaning import clean_wiki_text
+from haystack.utils.preprocessing import convert_files_to_dicts, tika_convert_files_to_dicts
+from haystack.utils.cleaning import clean_wiki_text
 
-
-@pytest.mark.tika
-def test_convert_files_to_dicts(xpdf_fixture):
+
+def test_convert_files_to_dicts():
     documents = convert_files_to_dicts(dir_path="samples", clean_func=clean_wiki_text, split_paragraphs=True)
     assert documents and len(documents) > 0
 
 
 @pytest.mark.tika
-def test_tika_convert_files_to_dicts(tika_fixture):
+def test_tika_convert_files_to_dicts():
     documents = tika_convert_files_to_dicts(dir_path="samples", clean_func=clean_wiki_text, split_paragraphs=True)
     assert documents and len(documents) > 0
 