Add CI for windows runner (#1458)

* Feat: Remove the use of a temp file while downloading archives from a URL, and add CI for the Windows and Mac platforms

* Windows CI installs the GPU build of PyTorch by default, so update CI to pick the CPU version

* fixing mac cache build issue

* updating windows pip install command for torch

* another attempt

* updating ci

* Adding sudo

* fixing ls failure on windows

* another attempt to fix build issue

* Saving env variable of test files

* Adding debug log

* GitHub Actions differ on Windows

* adding debug

* another attempt

* Windows has a different way to receive env variables

* fixing template

* minor fix

* Adding debug

* Removing use of json

* Adding back fromJson

* adding toJson

* removing print

* another attempt

* disabling parallel run at least for testing

* installing docker for mac runner

* correcting docker install command

* Linux Docker containers are not supported on Windows

* Removing mac changes

* Upgrading pytorch

* using lts pytorch

* Separating win and ubuntu

* Install java 11

* enabling linux container env

* docker cli command

* docker cli command

* start elastic service

* List all services

* correcting service name

* Attempt to fix multiple test runs

* convert to json

* another attempt to check

* Updating build cache step

* attempt

* Add tika

* Separating windows CI

* Changing CI name

* Skipping tests which do not work on Windows

* Skipping tests for windows

* create cleanup function in conftest

* adding skipif markers on tests (see the sketch just before the diff below)

* Run Windows CI only on push to master

* Addressing review comments

* Enabling windows ci for this PR

* Tika init is called when importing the tika function

* handling tika import issue

* handling tika import issue in test

* Fixing import issue

* removing tika fixture

* Removing fixture from tests

* Disable windows ci on pull request

* Add back extra pytorch install step

Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
Lalit Pagaria 2021-10-29 13:52:28 +05:30 committed by GitHub
parent 08341f5698
commit e5b4b62d75
12 changed files with 131 additions and 25 deletions
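
Several commits above add platform guards to the test suite ("Skipping tests for windows", "adding skipif marker on tests"). For reference, a minimal sketch of the guard pattern the diff below applies; the test body here is illustrative and not part of the PR:

import sys

import pytest

@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner")
def test_example_with_tmp_path(tmp_path):
    # tmp_path-based tests are among those skipped on the Windows runner
    assert tmp_path.exists()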


@ -1,4 +1,4 @@
-name: Build
+name: Linux CI
 on:
   push:
@ -33,7 +33,7 @@ jobs:
         uses: actions/cache@v2
         with:
           path: ${{ env.pythonLocation }}
-          key: ${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
+          key: linux-${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
       - name: Install dependencies
         if: steps.cache-python-env.outputs.cache-hit != 'true'
         run: |
@ -70,7 +70,7 @@ jobs:
         uses: actions/cache@v2
         with:
           path: ${{ env.pythonLocation }}
-          key: ${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
+          key: linux-${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
       - name: Run Elasticsearch
         run: docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx128m" elasticsearch:7.9.2

.github/workflows/windows_ci.yml (new file, 97 lines)

@ -0,0 +1,97 @@
+name: Windows CI
+
+on:
+  push:
+    branches: [ master ]
+  # pull_request:
+  #   branches: [ master ]
+
+jobs:
+  type-check:
+    runs-on: windows-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: 3.8
+      - name: Test with mypy
+        run: |
+          pip install mypy types-Markdown types-requests types-PyYAML pydantic
+          mypy haystack
+
+  build-cache:
+    needs: type-check
+    runs-on: windows-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: 3.7
+      - run: echo "date=$(date +'%Y-%m-%d')" >> $env:GITHUB_ENV
+      - name: Cache
+        id: cache-python-env
+        uses: actions/cache@v2
+        with:
+          path: ${{ env.pythonLocation }}
+          key: windows-${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
+      - name: Install PyTorch on Windows
+        run: |
+          pip install torch==1.8.1+cpu -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
+      - name: Install dependencies
+        if: steps.cache-python-env.outputs.cache-hit != 'true'
+        run: |
+          python -m pip install --upgrade pip
+          pip install --upgrade --upgrade-strategy eager -r requirements-dev.txt -e .
+          pip install --upgrade --upgrade-strategy eager -f https://download.pytorch.org/whl/torch_stable.html -r requirements.txt -e .
+          pip install torch-scatter -f https://data.pyg.org/whl/torch-1.9.0+cpu.html
+
+  prepare-build:
+    needs: build-cache
+    # This step only lists the test files; it errors on Windows, so run it on Ubuntu
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v2
+      - id: set-matrix
+        run: |
+          echo "::set-output name=matrix::$(cd test && ls -d test_*.py | jq -R . | jq -cs .)"
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+
+  build:
+    needs: prepare-build
+    runs-on: windows-latest
+    strategy:
+      matrix:
+        test-path: ${{fromJson(needs.prepare-build.outputs.matrix)}}
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python 3.7
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.7
+      - run: echo "date=$(date +'%Y-%m-%d')" >> $env:GITHUB_ENV
+      - name: Cache
+        uses: actions/cache@v2
+        with:
+          path: ${{ env.pythonLocation }}
+          key: windows-${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
+      # The Windows runner can't run Linux containers; see https://github.com/actions/virtual-environments/issues/1143
+      - name: Set up Windows test env
+        run: |
+          choco install xpdf-utils
+          choco install openjdk11
+          refreshenv
+          choco install tesseract --pre
+          choco install elasticsearch --version=7.9.2
+          refreshenv
+          Get-Service elasticsearch-service-x64 | Start-Service
+      # Skip test files whose tests are disabled on Windows: quite a few tests are
+      # skipped there, so these files would otherwise fail at collection time,
+      # see https://github.com/pytest-dev/pytest/issues/812
+      # Skipping test_ray, test_utils, test_preprocessor, test_knowledge_graph and test_connector
+      - name: Run tests
+        if: ${{ !contains(fromJSON('["test_ray.py", "test_knowledge_graph.py", "test_connector.py"]'), matrix.test-path) }}
+        run: cd test && pytest --document_store_type=memory,faiss,elasticsearch -m "not tika and not graphdb" -s ${{ matrix.test-path }}
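
The prepare-build job above shells out to ls and jq to emit the list of test files as a compact JSON array, which the build job then consumes via fromJson. A rough Python equivalent of that one-liner, for illustration only (the workflow keeps the shell form):

import json
from pathlib import Path

# Collect test_*.py under test/ and print a compact JSON array, mirroring:
# cd test && ls -d test_*.py | jq -R . | jq -cs .
matrix = json.dumps(sorted(p.name for p in Path("test").glob("test_*.py")))
print(f"::set-output name=matrix::{matrix}")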


@ -8,7 +8,6 @@ from haystack.nodes.file_converter import (
     BaseConverter,
     DocxToTextConverter,
     PDFToTextConverter,
-    TikaConverter,
     TextConverter
 )
@ -99,6 +98,11 @@ def tika_convert_files_to_dicts(
     :param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
     :param split_paragraphs: split text in paragraphs.
     """
+    try:
+        from haystack.nodes.file_converter import TikaConverter
+    except Exception as ex:
+        logger.error("Tika not installed. Please install tika and try again. Error: {}".format(ex))
+        raise ex
     converter = TikaConverter()
     paths = [p for p in Path(dir_path).glob("**/*")]
     allowed_suffixes = [".pdf", ".txt"]
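
The hunk above moves the TikaConverter import into the function body, so that importing haystack.utils no longer triggers Tika's import-time initialization. A minimal standalone sketch of the same deferred-import pattern; the helper name is illustrative, not from the PR:

import logging

logger = logging.getLogger(__name__)

def _load_tika_converter():
    # Import inside the function, not at module level, to keep Tika's
    # side effects out of a plain `import haystack.utils`.
    try:
        from haystack.nodes.file_converter import TikaConverter
    except Exception as ex:
        logger.error("Tika not installed. Please install tika and try again. Error: %s", ex)
        raise
    return TikaConverter()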


@ -224,7 +224,7 @@ def tika_fixture():

 @pytest.fixture(scope="session")
-def xpdf_fixture(tika_fixture):
+def xpdf_fixture():
     verify_installation = run(["pdftotext"], shell=True)
     if verify_installation.returncode == 127:
         if platform.startswith("linux"):


@ -68,6 +68,7 @@ def test_write_with_duplicate_doc_ids_custom_index(document_store):
     # writing to the default, empty index should still work
     document_store.write_documents(documents, duplicate_documents="fail")

 def test_get_all_documents_without_filters(document_store_with_docs):
     documents = document_store_with_docs.get_all_documents()
     assert all(isinstance(d, Document) for d in documents)
@ -812,7 +813,7 @@ def test_get_meta_values_by_key(document_store):

 @pytest.mark.elasticsearch
-def test_elasticsearch_custom_fields(elasticsearch_fixture):
+def test_elasticsearch_custom_fields():
     client = Elasticsearch()
     client.indices.delete(index='haystack_test_custom', ignore=[404])
     document_store = ElasticsearchDocumentStore(index="haystack_test_custom", content_field="custom_text_field",


@ -1,4 +1,5 @@
 import time
+import sys
 import faiss
 import math
 import numpy as np
@ -19,6 +20,7 @@ DOCUMENTS = [
 ]

+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner")
 def test_faiss_index_save_and_load(tmp_path):
     document_store = FAISSDocumentStore(
         sql_url=f"sqlite:////{tmp_path/'haystack_test.db'}",
@ -47,6 +49,7 @@ def test_faiss_index_save_and_load(tmp_path):
     assert not new_document_store.progress_bar

+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner")
 def test_faiss_index_save_and_load_custom_path(tmp_path):
     document_store = FAISSDocumentStore(
         sql_url=f"sqlite:////{tmp_path/'haystack_test.db'}",
@ -95,7 +98,7 @@ def test_faiss_write_docs(document_store, index_buffer_size, batch_size):
         stored_emb = document_store.faiss_indexes[document_store.index].reconstruct(int(doc.meta["vector_id"]))
         # compare original input vec with stored one (ignore extra dim added by hnsw)
         assert np.allclose(original_doc["embedding"], stored_emb, rtol=0.01)

 @pytest.mark.slow
 @pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@ -158,6 +161,7 @@ def test_update_with_empty_store(document_store, retriever):
     assert len(documents_indexed) == len(DOCUMENTS)

+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner")
 @pytest.mark.parametrize("index_factory", ["Flat", "HNSW", "IVF1,Flat"])
 def test_faiss_retrieving(index_factory, tmp_path):
     document_store = FAISSDocumentStore(
@ -253,7 +257,7 @@ def test_delete_docs_by_id_with_filters(document_store, retriever):
     all_ids_left = [doc.id for doc in documents]
     assert all(doc_id in all_ids_left for doc_id in ids_not_to_delete)

 @pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
 @pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
@ -271,6 +275,7 @@ def test_pipeline(document_store, retriever):
     assert len(output["documents"]) == 3

+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner")
 def test_faiss_passing_index_from_outside(tmp_path):
     d = 768
     nlist = 2
@ -295,6 +300,7 @@ def test_faiss_passing_index_from_outside(tmp_path):
         assert 0 <= int(doc.meta["vector_id"]) <= 7

+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner")
 def test_faiss_cosine_similarity(tmp_path):
     document_store = FAISSDocumentStore(
         sql_url=f"sqlite:////{tmp_path/'haystack_test_faiss.db'}", similarity='cosine'
@ -322,7 +328,7 @@ def test_faiss_cosine_similarity(tmp_path):
         # check if the stored embedding was normalized
         assert np.allclose(original_emb[0], result_emb, rtol=0.01)

         # check if the score is plausible for cosine similarity
         assert 0 <= doc.score <= 1.0
@ -342,7 +348,7 @@ def test_faiss_cosine_similarity(tmp_path):
     assert not np.allclose(original_emb[0], doc.embedding, rtol=0.01)

+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner")
 def test_faiss_cosine_sanity_check(tmp_path):
     document_store = FAISSDocumentStore(
         sql_url=f"sqlite:////{tmp_path/'haystack_test_faiss.db'}", similarity='cosine',


@ -13,7 +13,7 @@ from haystack.file_converter.tika import TikaConverter
     # "Converter", [PDFToTextConverter, TikaConverter, PDFToTextOCRConverter]
     "Converter", [PDFToTextOCRConverter]
 )
-def test_convert(Converter, xpdf_fixture):
+def test_convert(Converter):
     converter = Converter()
     document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
     pages = document["content"].split("\f")
@ -31,7 +31,7 @@ def test_convert(Converter, xpdf_fixture):
 @pytest.mark.tika
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
-def test_table_removal(Converter, xpdf_fixture):
+def test_table_removal(Converter):
     converter = Converter(remove_numeric_tables=True)
     document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
     pages = document["content"].split("\f")
@ -42,7 +42,7 @@ def test_table_removal(Converter, xpdf_fixture):
 @pytest.mark.tika
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
-def test_language_validation(Converter, xpdf_fixture, caplog):
+def test_language_validation(Converter, caplog):
     converter = Converter(valid_languages=["en"])
     converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
     assert (


@ -1,3 +1,4 @@
+import sys
 from typing import List

 import numpy as np
@ -428,6 +429,7 @@ def test_generator_pipeline(document_store, retriever, rag_generator):
     assert "berlin" in answers[0]["answer"]

+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Gives memory allocation error on windows runner")
 @pytest.mark.slow
 @pytest.mark.generator
 @pytest.mark.parametrize("document_store", ["memory"], indirect=True)


@ -6,8 +6,9 @@ from haystack.nodes import Text2SparqlRetriever
 from haystack.document_stores import GraphDBKnowledgeGraph
 from haystack.utils import fetch_archive_from_http

-def test_graph_retrieval(graphdb_fixture):
+@pytest.mark.graphdb
+def test_graph_retrieval():
     # TODO rename doc_dir
     graph_dir = "../data/tutorial10_knowledge_graph/"
     s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/triples_and_config.zip"


@ -18,7 +18,6 @@ in the sentence.
 """

-@pytest.mark.tika
 def test_preprocess_sentence_split():
     document = {"content": TEXT}
     preprocessor = PreProcessor(split_length=1, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False)
@ -32,7 +31,6 @@ def test_preprocess_sentence_split():
     assert len(documents) == 2

-@pytest.mark.tika
 def test_preprocess_word_split():
     document = {"content": TEXT}
     preprocessor = PreProcessor(split_length=10, split_overlap=0, split_by="word", split_respect_sentence_boundary=False)
@ -56,7 +54,6 @@ def test_preprocess_word_split():
     assert len(documents) == 15

-@pytest.mark.tika
 def test_preprocess_passage_split():
     document = {"content": TEXT}
     preprocessor = PreProcessor(split_length=1, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False)
@ -68,7 +65,6 @@ def test_preprocess_passage_split():
     assert len(documents) == 2

-@pytest.mark.tika
 def test_clean_header_footer():
     converter = PDFToTextConverter()
     document = converter.convert(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header/footer


@ -94,7 +94,7 @@ def test_retrieval(retriever_with_docs, document_store_with_docs):

 @pytest.mark.elasticsearch
-def test_elasticsearch_custom_query(elasticsearch_fixture):
+def test_elasticsearch_custom_query():
     client = Elasticsearch()
     client.indices.delete(index="haystack_test_custom", ignore=[404])
     document_store = ElasticsearchDocumentStore(


@ -1,17 +1,16 @@
 import pytest

-from haystack.preprocessor.utils import convert_files_to_dicts, tika_convert_files_to_dicts
-from haystack.preprocessor.cleaning import clean_wiki_text
+from haystack.utils.preprocessing import convert_files_to_dicts, tika_convert_files_to_dicts
+from haystack.utils.cleaning import clean_wiki_text

-@pytest.mark.tika
-def test_convert_files_to_dicts(xpdf_fixture):
+def test_convert_files_to_dicts():
     documents = convert_files_to_dicts(dir_path="samples", clean_func=clean_wiki_text, split_paragraphs=True)
     assert documents and len(documents) > 0

 @pytest.mark.tika
-def test_tika_convert_files_to_dicts(tika_fixture):
+def test_tika_convert_files_to_dicts():
     documents = tika_convert_files_to_dicts(dir_path="samples", clean_func=clean_wiki_text, split_paragraphs=True)
     assert documents and len(documents) > 0