mirror of https://github.com/deepset-ai/haystack.git
synced 2026-01-08 13:06:29 +00:00
Add CI for windows runner (#1458)
* Feat: Removing use of temp file while downloading archive from url, along with adding CI for Windows and Mac platforms
* Windows CI by default installs the GPU build of PyTorch, hence updating CI to pick the CPU version
* Fixing Mac cache build issue
* Updating Windows pip install command for torch
* Another attempt
* Updating CI
* Adding sudo
* Fixing ls failure on Windows
* Another attempt to fix build issue
* Saving env variable of test files
* Adding debug log
* GitHub Actions differ on Windows
* Adding debug
* Another attempt
* Windows has a different way to receive env variables
* Fixing template
* Minor fix
* Adding debug
* Removing use of JSON
* Adding back fromJson
* Adding toJson
* Removing print
* Another attempt
* Disabling parallel run, at least for testing
* Installing Docker for the Mac runner
* Correcting Docker install command
* Linux Docker containers are not supported on Windows
* Removing Mac changes
* Upgrading PyTorch
* Using LTS PyTorch
* Separating win and ubuntu
* Install Java 11
* Enabling Linux container env
* Docker CLI command
* Docker CLI command
* Start Elasticsearch service
* List all services
* Correcting service name
* Attempt to fix multiple test run
* Convert to JSON
* Another attempt to check
* Updating build cache step
* Attempt
* Add Tika
* Separating Windows CI
* Changing CI name
* Skipping test which does not work on Windows
* Skipping tests for Windows
* Create cleanup function in conftest
* Adding skipif marker on tests
* Run Windows PR only on push to master
* Addressing review comments
* Enabling Windows CI for this PR
* Tika init is being called when importing a Tika function
* Handling Tika import issue
* Handling Tika import issue in test
* Fixing import issue
* Removing Tika fixture
* Removing fixture from tests
* Disable Windows CI on pull request
* Add back extra PyTorch install step

Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
This commit is contained in:
parent 08341f5698
commit e5b4b62d75
@@ -1,4 +1,4 @@
-name: Build
+name: Linux CI

 on:
   push:
@@ -33,7 +33,7 @@ jobs:
       uses: actions/cache@v2
       with:
         path: ${{ env.pythonLocation }}
-        key: ${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
+        key: linux-${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
     - name: Install dependencies
       if: steps.cache-python-env.outputs.cache-hit != 'true'
       run: |
@@ -70,7 +70,7 @@ jobs:
       uses: actions/cache@v2
       with:
         path: ${{ env.pythonLocation }}
-        key: ${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
+        key: linux-${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
     - name: Run Elasticsearch
       run: docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx128m" elasticsearch:7.9.2
.github/workflows/windows_ci.yml (new file, vendored, 97 lines)
@@ -0,0 +1,97 @@
+name: Windows CI
+
+on:
+  push:
+    branches: [ master ]
+  # pull_request:
+  #   branches: [ master ]
+
+jobs:
+  type-check:
+    runs-on: windows-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: 3.8
+      - name: Test with mypy
+        run: |
+          pip install mypy types-Markdown types-requests types-PyYAML pydantic
+          mypy haystack
+
+  build-cache:
+    needs: type-check
+    runs-on: windows-latest
+
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: 3.7
+      - run: echo "date=$(date +'%Y-%m-%d')" >> $env:GITHUB_ENV
+      - name: Cache
+        id: cache-python-env
+        uses: actions/cache@v2
+        with:
+          path: ${{ env.pythonLocation }}
+          key: windows-${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
+      - name: Install PyTorch on Windows
+        run: |
+          pip install torch==1.8.1+cpu -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
+      - name: Install dependencies
+        if: steps.cache-python-env.outputs.cache-hit != 'true'
+        run: |
+          python -m pip install --upgrade pip
+          pip install --upgrade --upgrade-strategy eager -r requirements-dev.txt -e .
+          pip install --upgrade --upgrade-strategy eager -f https://download.pytorch.org/whl/torch_stable.html -r requirements.txt -e .
+          pip install torch-scatter -f https://data.pyg.org/whl/torch-1.9.0+cpu.html
+
+  prepare-build:
+    needs: build-cache
+    # This job only lists the test files; it errors on the Windows runner, hence it runs on Ubuntu
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v2
+      - id: set-matrix
+        run: |
+          echo "::set-output name=matrix::$(cd test && ls -d test_*.py | jq -R . | jq -cs .)"
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+
+  build:
+    needs: prepare-build
+    runs-on: windows-latest
+    strategy:
+      matrix:
+        test-path: ${{fromJson(needs.prepare-build.outputs.matrix)}}
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python 3.7
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.7
+      - run: echo "date=$(date +'%Y-%m-%d')" >> $env:GITHUB_ENV
+      - name: Cache
+        uses: actions/cache@v2
+        with:
+          path: ${{ env.pythonLocation }}
+          key: windows-${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
+
+      # The Windows runner can't run Linux containers. Refer https://github.com/actions/virtual-environments/issues/1143
+      - name: Set up Windows test env
+        run: |
+          choco install xpdf-utils
+          choco install openjdk11
+          refreshenv
+          choco install tesseract --pre
+          choco install elasticsearch --version=7.9.2
+          refreshenv
+          Get-Service elasticsearch-service-x64 | Start-Service
+
+      # Test files whose tests are all disabled on Windows have to be skipped entirely,
+      # otherwise pytest errors out on them; see https://github.com/pytest-dev/pytest/issues/812
+      # Skipped here: test_ray, test_knowledge_graph and test_connector
+      - name: Run tests
+        if: ${{ !contains(fromJSON('["test_ray.py", "test_knowledge_graph.py", "test_connector.py"]'), matrix.test-path) }}
+        run: cd test && pytest --document_store_type=memory,faiss,elasticsearch -m "not tika and not graphdb" -s ${{ matrix.test-path }}
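Side note on the matrix mechanics above: `prepare-build` publishes a plain JSON array of test file names via `::set-output`, and the `build` job fans out over it with `fromJson`. A minimal Python sketch of what the `ls -d test_*.py | jq -R . | jq -cs .` pipeline produces, assuming it is run from the repository root:

```python
import json
from pathlib import Path

# Same output shape as: cd test && ls -d test_*.py | jq -R . | jq -cs .
test_files = sorted(p.name for p in Path("test").glob("test_*.py"))
print(json.dumps(test_files, separators=(",", ":")))
# -> ["test_connector.py","test_document_store.py",...]
```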
@@ -8,7 +8,6 @@ from haystack.nodes.file_converter import (
     BaseConverter,
     DocxToTextConverter,
     PDFToTextConverter,
-    TikaConverter,
     TextConverter
 )
@@ -99,6 +98,11 @@ def tika_convert_files_to_dicts(
     :param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
     :param split_paragraphs: split text in paragraphs.
     """
+    try:
+        from haystack.nodes.file_converter import TikaConverter
+    except Exception as ex:
+        logger.error("Tika not installed. Please install tika and try again. Error: {}".format(ex))
+        raise ex
     converter = TikaConverter()
     paths = [p for p in Path(dir_path).glob("**/*")]
     allowed_suffixes = [".pdf", ".txt"]
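The change above defers the `TikaConverter` import to call time, so merely importing the module can no longer fail on machines without Tika (such as the Windows runner). A stripped-down sketch of the same guarded-import pattern; `convert_with_tika` is a hypothetical helper, not part of Haystack:

```python
import logging

logger = logging.getLogger(__name__)

def convert_with_tika(file_path):
    # Import the optional dependency lazily: module import never fails on
    # platforms (e.g. the Windows runner) where Tika is not installed.
    try:
        from haystack.nodes.file_converter import TikaConverter
    except Exception as ex:
        logger.error("Tika not installed. Please install tika and try again. Error: {}".format(ex))
        raise
    return TikaConverter().convert(file_path=file_path)
```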
@@ -224,7 +224,7 @@ def tika_fixture():


 @pytest.fixture(scope="session")
-def xpdf_fixture(tika_fixture):
+def xpdf_fixture():
     verify_installation = run(["pdftotext"], shell=True)
     if verify_installation.returncode == 127:
         if platform.startswith("linux"):
@@ -68,6 +68,7 @@ def test_write_with_duplicate_doc_ids_custom_index(document_store):
     # writing to the default, empty index should still work
     document_store.write_documents(documents, duplicate_documents="fail")
+

 def test_get_all_documents_without_filters(document_store_with_docs):
     documents = document_store_with_docs.get_all_documents()
     assert all(isinstance(d, Document) for d in documents)
@@ -812,7 +813,7 @@ def test_get_meta_values_by_key(document_store):


 @pytest.mark.elasticsearch
-def test_elasticsearch_custom_fields(elasticsearch_fixture):
+def test_elasticsearch_custom_fields():
     client = Elasticsearch()
     client.indices.delete(index='haystack_test_custom', ignore=[404])
     document_store = ElasticsearchDocumentStore(index="haystack_test_custom", content_field="custom_text_field",
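With `elasticsearch_fixture` gone from the signature, the test assumes an Elasticsearch instance is already reachable on localhost:9200, started as a Docker container on the Linux runner and as a Chocolatey-installed service on the Windows runner. A quick connectivity check using the elasticsearch-py client the tests already depend on (a sketch, not part of the diff):

```python
from elasticsearch import Elasticsearch

client = Elasticsearch()  # elasticsearch-py 7.x defaults to http://localhost:9200
assert client.ping(), "Elasticsearch is not reachable; did the CI service start?"
```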
@@ -1,4 +1,5 @@
 import time
+import sys

 import faiss
 import math
 import numpy as np
@@ -19,6 +20,7 @@ DOCUMENTS = [
 ]


+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner")
 def test_faiss_index_save_and_load(tmp_path):
     document_store = FAISSDocumentStore(
         sql_url=f"sqlite:////{tmp_path/'haystack_test.db'}",
@@ -47,6 +49,7 @@ def test_faiss_index_save_and_load(tmp_path):
     assert not new_document_store.progress_bar


+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner")
 def test_faiss_index_save_and_load_custom_path(tmp_path):
     document_store = FAISSDocumentStore(
         sql_url=f"sqlite:////{tmp_path/'haystack_test.db'}",
@@ -95,7 +98,7 @@ def test_faiss_write_docs(document_store, index_buffer_size, batch_size):
         stored_emb = document_store.faiss_indexes[document_store.index].reconstruct(int(doc.meta["vector_id"]))
         # compare original input vec with stored one (ignore extra dim added by hnsw)
         assert np.allclose(original_doc["embedding"], stored_emb, rtol=0.01)


 @pytest.mark.slow
 @pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@@ -158,6 +161,7 @@ def test_update_with_empty_store(document_store, retriever):
     assert len(documents_indexed) == len(DOCUMENTS)


+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner")
 @pytest.mark.parametrize("index_factory", ["Flat", "HNSW", "IVF1,Flat"])
 def test_faiss_retrieving(index_factory, tmp_path):
     document_store = FAISSDocumentStore(
@@ -253,7 +257,7 @@ def test_delete_docs_by_id_with_filters(document_store, retriever):
     all_ids_left = [doc.id for doc in documents]
     assert all(doc_id in all_ids_left for doc_id in ids_not_to_delete)


 @pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
 @pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
@@ -271,6 +275,7 @@ def test_pipeline(document_store, retriever):
     assert len(output["documents"]) == 3


+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner")
 def test_faiss_passing_index_from_outside(tmp_path):
     d = 768
     nlist = 2
@@ -295,6 +300,7 @@ def test_faiss_passing_index_from_outside(tmp_path):
     assert 0 <= int(doc.meta["vector_id"]) <= 7


+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner")
 def test_faiss_cosine_similarity(tmp_path):
     document_store = FAISSDocumentStore(
         sql_url=f"sqlite:////{tmp_path/'haystack_test_faiss.db'}", similarity='cosine'
@@ -322,7 +328,7 @@ def test_faiss_cosine_similarity(tmp_path):

     # check if the stored embedding was normalized
     assert np.allclose(original_emb[0], result_emb, rtol=0.01)

     # check if the score is plausible for cosine similarity
     assert 0 <= doc.score <= 1.0
@@ -342,7 +348,7 @@ def test_faiss_cosine_similarity(tmp_path):
     assert not np.allclose(original_emb[0], doc.embedding, rtol=0.01)


+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner")
 def test_faiss_cosine_sanity_check(tmp_path):
     document_store = FAISSDocumentStore(
         sql_url=f"sqlite:////{tmp_path/'haystack_test_faiss.db'}", similarity='cosine',
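The same `skipif` marker is repeated verbatim on every `tmp_path`-based FAISS test. One possible tidy-up (a sketch, not what this PR does) is to bind the marker once at module level and reuse it:

```python
import sys

import pytest

# Bind the marker once; every tmp_path-based FAISS test can then reuse it.
skip_on_windows = pytest.mark.skipif(
    sys.platform in ["win32", "cygwin"],
    reason="Test with tmp_path not working on windows runner",
)

@skip_on_windows
def test_example(tmp_path):
    assert tmp_path.exists()
```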
@@ -13,7 +13,7 @@ from haystack.file_converter.tika import TikaConverter
     # "Converter", [PDFToTextConverter, TikaConverter, PDFToTextOCRConverter]
     "Converter", [PDFToTextOCRConverter]
 )
-def test_convert(Converter, xpdf_fixture):
+def test_convert(Converter):
     converter = Converter()
     document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
     pages = document["content"].split("\f")
@@ -31,7 +31,7 @@ def test_convert(Converter, xpdf_fixture):

 @pytest.mark.tika
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
-def test_table_removal(Converter, xpdf_fixture):
+def test_table_removal(Converter):
     converter = Converter(remove_numeric_tables=True)
     document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
     pages = document["content"].split("\f")
@@ -42,7 +42,7 @@ def test_table_removal(Converter, xpdf_fixture):

 @pytest.mark.tika
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
-def test_language_validation(Converter, xpdf_fixture, caplog):
+def test_language_validation(Converter, caplog):
     converter = Converter(valid_languages=["en"])
     converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
     assert (
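With `xpdf_fixture` dropped from the signatures above, the converter tests rely on `pdftotext` having been installed up front (`choco install xpdf-utils` on Windows, the package manager on Linux). A snippet mirroring the check the old fixture performed:

```python
from subprocess import run

# Exit code 127 ("command not found") is how the old conftest fixture
# detected a missing pdftotext binary.
verify_installation = run("pdftotext", shell=True)
print(verify_installation.returncode)
```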
@@ -1,3 +1,4 @@
+import sys
 from typing import List

 import numpy as np
@@ -428,6 +429,7 @@ def test_generator_pipeline(document_store, retriever, rag_generator):
     assert "berlin" in answers[0]["answer"]


+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Gives memory allocation error on windows runner")
 @pytest.mark.slow
 @pytest.mark.generator
 @pytest.mark.parametrize("document_store", ["memory"], indirect=True)
@@ -6,8 +6,9 @@ from haystack.nodes import Text2SparqlRetriever
 from haystack.document_stores import GraphDBKnowledgeGraph
 from haystack.utils import fetch_archive_from_http


+@pytest.mark.graphdb
-def test_graph_retrieval(graphdb_fixture):
+def test_graph_retrieval():
     # TODO rename doc_dir
     graph_dir = "../data/tutorial10_knowledge_graph/"
     s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/triples_and_config.zip"
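The `graphdb` marker added above is what the Windows workflow's `-m "not tika and not graphdb"` expression deselects. For that expression to work without warnings, the markers have to be registered with pytest; a hypothetical conftest.py excerpt of such registration (the real project may register them differently, e.g. in pytest.ini):

```python
def pytest_configure(config):
    # Registering markers lets `pytest -m "not tika and not graphdb"`
    # deselect them without "unknown marker" warnings.
    config.addinivalue_line("markers", "tika: requires an Apache Tika server")
    config.addinivalue_line("markers", "graphdb: requires a running GraphDB instance")
```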
@@ -18,7 +18,6 @@ in the sentence.
 """

-@pytest.mark.tika
 def test_preprocess_sentence_split():
     document = {"content": TEXT}
     preprocessor = PreProcessor(split_length=1, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False)
@@ -32,7 +31,6 @@ def test_preprocess_sentence_split():
     assert len(documents) == 2


-@pytest.mark.tika
 def test_preprocess_word_split():
     document = {"content": TEXT}
     preprocessor = PreProcessor(split_length=10, split_overlap=0, split_by="word", split_respect_sentence_boundary=False)
@@ -56,7 +54,6 @@ def test_preprocess_word_split():
     assert len(documents) == 15


-@pytest.mark.tika
 def test_preprocess_passage_split():
     document = {"content": TEXT}
     preprocessor = PreProcessor(split_length=1, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False)
@@ -68,7 +65,6 @@ def test_preprocess_passage_split():
     assert len(documents) == 2


-@pytest.mark.tika
 def test_clean_header_footer():
     converter = PDFToTextConverter()
     document = converter.convert(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header/footer
@@ -94,7 +94,7 @@ def test_retrieval(retriever_with_docs, document_store_with_docs):


 @pytest.mark.elasticsearch
-def test_elasticsearch_custom_query(elasticsearch_fixture):
+def test_elasticsearch_custom_query():
     client = Elasticsearch()
     client.indices.delete(index="haystack_test_custom", ignore=[404])
     document_store = ElasticsearchDocumentStore(
@@ -1,17 +1,16 @@
 import pytest

-from haystack.preprocessor.utils import convert_files_to_dicts, tika_convert_files_to_dicts
-from haystack.preprocessor.cleaning import clean_wiki_text
+from haystack.utils.preprocessing import convert_files_to_dicts, tika_convert_files_to_dicts
+from haystack.utils.cleaning import clean_wiki_text


 @pytest.mark.tika
-def test_convert_files_to_dicts(xpdf_fixture):
+def test_convert_files_to_dicts():
     documents = convert_files_to_dicts(dir_path="samples", clean_func=clean_wiki_text, split_paragraphs=True)
     assert documents and len(documents) > 0


 @pytest.mark.tika
-def test_tika_convert_files_to_dicts(tika_fixture):
+def test_tika_convert_files_to_dicts():
     documents = tika_convert_files_to_dicts(dir_path="samples", clean_func=clean_wiki_text, split_paragraphs=True)
     assert documents and len(documents) > 0
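This last hunk tracks Haystack's module reorganisation: the preprocessing helpers moved from `haystack.preprocessor.*` to `haystack.utils.*`. Downstream code migrates the same way; a sketch, assuming Haystack is installed and a `samples` directory exists:

```python
# Old layout: from haystack.preprocessor.utils import convert_files_to_dicts
# New layout, as used in the updated test:
from haystack.utils.preprocessing import convert_files_to_dicts
from haystack.utils.cleaning import clean_wiki_text

docs = convert_files_to_dicts(dir_path="samples", clean_func=clean_wiki_text, split_paragraphs=True)
print(len(docs))
```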