# haystack/test/nodes/test_file_converter.py

from typing import List
import os
import sys
from pathlib import Path
import subprocess
import csv
import json

import pandas as pd
import pytest

from haystack import Document
from haystack.nodes import (
MarkdownConverter,
DocxToTextConverter,
PDFToTextConverter,
PDFToTextOCRConverter,
TikaConverter,
AzureConverter,
ParsrConverter,
TextConverter,
CsvTextConverter,
JsonConverter,
PreProcessor,
)

from ..conftest import SAMPLES_PATH, fail_at_version


@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter, PDFToTextOCRConverter])
def test_convert(Converter):
converter = Converter()
document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")[0]["documents"][0]
pages = document.content.split("\f")
    assert (
        len(pages) != 1 and pages[0] != ""
    ), f'{type(converter).__name__} returned a single empty page, which indicates a potential issue with your installed poppler version. Try installing it via "conda install -c conda-forge poppler" and check test_pdftoppm_command_format()'
    assert len(pages) == 4  # the sample PDF file has four pages.
    assert pages[0] != ""  # page 1 of the PDF contains text.
    assert pages[2] == ""  # page 3 of the PDF file is empty.
    # Assert that text is retained from the document.
    # As whitespace can differ ("\n", " ", etc.), we standardize all of it to simple spaces.
page_standard_whitespace = " ".join(pages[0].split())
assert "Adobe Systems made the PDF specification available free of charge in 1993." in page_standard_whitespace


# Marked as integration because it uses poppler, which is not installed in the unit test suite
@pytest.mark.integration
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Poppler not installed on Windows CI")
def test_pdftoppm_command_format():
# Haystack's PDFToTextOCRConverter uses pdf2image, which calls pdftoppm internally.
    # Some installations of pdftoppm are incompatible with Haystack: they won't raise an error
    # but simply return empty converted documents.
    # This test runs pdftoppm directly to check whether it accepts the command format that
    # pdf2image uses in Haystack.
proc = subprocess.Popen(
["pdftoppm", f"{SAMPLES_PATH}/pdf/sample_pdf_1.pdf"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
out, err = proc.communicate()
    # If pdftoppm prints its usage info to stderr, the command format used by Haystack's
    # pdf2image is incompatible with this poppler installation.
assert (
not err
), 'Your installation of poppler is incompatible with Haystack. Try installing via "conda install -c conda-forge poppler"'
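

# For reference, a minimal sketch (illustrative only, not a test) of the pdf2image call that
# PDFToTextOCRConverter relies on; pdf2image shells out to pdftoppm under the hood.
# Assumes pdf2image is installed alongside poppler.
def _example_pdf2image_usage():
    from pdf2image import convert_from_path

    # One PIL image per page; sample_pdf_1.pdf has four pages.
    return convert_from_path(SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")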
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_command_whitespaces(Converter):
converter = Converter()
document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample pdf file with spaces on file name.pdf")[0][
"documents"
][0]
assert "ɪ" in document.content
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_encoding(Converter):
converter = Converter()
document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_5.pdf")[0]["documents"][0]
assert "Ж" in document.content
document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
assert "ɪ" in document.content
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_sort_by_position(Converter):
converter = Converter(sort_by_position=True)
document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_3.pdf")[0]
assert str(document.content).startswith("This is the second test sentence.")
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_ligatures(Converter):
converter = Converter()
document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
assert "" not in document.content
assert "ɪ" in document.content
document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf", known_ligatures={})[0]["documents"][
0
]
assert "" in document.content
assert "ɪ" in document.content
document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf", known_ligatures={"ɪ": "i"})[0][
"documents"
][0]
assert "" in document.content
assert "ɪ" not in document.content
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_page_range(Converter):
converter = Converter()
document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf", start_page=2)[0]
pages = document.content.split("\f")
    assert (
        len(pages) == 4
    )  # the sample PDF file has four pages; we skipped the first but still want the correct total count.
    assert pages[0] == ""  # page 1 was skipped.
    assert pages[1] != ""  # page 2 is not empty.
    assert pages[2] == ""  # page 3 is empty.
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_page_range_numbers(Converter):
converter = Converter()
document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf", start_page=2)[0]
preprocessor = PreProcessor(
split_by="word", split_length=5, split_overlap=0, split_respect_sentence_boundary=False, add_page_number=True
)
documents = preprocessor.process([document])
assert documents[1].meta["page"] == 4
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel(Converter):
converter = Converter(multiprocessing=True)
document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_6.pdf")[0]
pages = document.content.split("\f")
assert pages[0] == "This is the page 1 of the document."
assert pages[-1] == "This is the page 50 of the document."
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel_page_range(Converter):
converter = Converter(multiprocessing=True)
document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_6.pdf", start_page=2)[0]
pages = document.content.split("\f")
assert pages[0] == ""
assert len(pages) == 50
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel_sort_by_position(Converter):
converter = Converter(multiprocessing=True, sort_by_position=True)
document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_6.pdf")[0]
pages = document.content.split("\f")
assert pages[0] == "This is the page 1 of the document."
assert pages[-1] == "This is the page 50 of the document."
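

# `fail_at_version` (imported from conftest) makes a test fail once the installed Haystack version
# reaches the given release, serving as a reminder to remove the deprecated parameters below.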
@fail_at_version(1, 17)
def test_deprecated_encoding():
with pytest.warns(DeprecationWarning):
converter = PDFToTextConverter(encoding="utf-8")
@fail_at_version(1, 17)
def test_deprecated_encoding_in_convert_method():
converter = PDFToTextConverter()
with pytest.warns(DeprecationWarning):
converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf", encoding="utf-8")
@fail_at_version(1, 17)
def test_deprecated_keep_physical_layout():
with pytest.warns(DeprecationWarning):
converter = PDFToTextConverter(keep_physical_layout=True)
@fail_at_version(1, 17)
def test_deprecated_keep_physical_layout_in_convert_method():
converter = PDFToTextConverter()
with pytest.warns(DeprecationWarning):
converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf", keep_physical_layout=True)


@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_table_removal(Converter):
converter = Converter(remove_numeric_tables=True)
document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")[0]
pages = document.content.split("\f")
# assert numeric rows are removed from the table.
assert "324" not in pages[0]
assert "54x growth" not in pages[0]


@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_language_validation(Converter, caplog):
converter = Converter(valid_languages=["en"])
converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
assert "sample_pdf_1.pdf is not one of ['en']." not in caplog.text
converter = Converter(valid_languages=["de"])
converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
assert "sample_pdf_1.pdf is not one of ['de']." in caplog.text


@pytest.mark.unit
def test_docx_converter():
converter = DocxToTextConverter()
document = converter.convert(file_path=SAMPLES_PATH / "docx" / "sample_docx.docx")[0]
assert document.content.startswith("Sample Docx File")


@pytest.mark.unit
def test_markdown_converter():
converter = MarkdownConverter()
document = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")[0]
assert document.content.startswith("\nWhat to build with Haystack")
assert "# git clone https://github.com/deepset-ai/haystack.git" not in document.content
@pytest.mark.unit
def test_markdown_converter_headline_extraction():
expected_headlines = [
("What to build with Haystack", 1),
("Core Features", 1),
("Quick Demo", 1),
("2nd level headline for testing purposes", 2),
("3rd level headline for testing purposes", 3),
]
converter = MarkdownConverter(extract_headlines=True, remove_code_snippets=False)
document = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")[0]
# Check if correct number of headlines are extracted
assert len(document.meta["headlines"]) == 5
for extracted_headline, (expected_headline, expected_level) in zip(document.meta["headlines"], expected_headlines):
# Check if correct headline and level is extracted
assert extracted_headline["headline"] == expected_headline
assert extracted_headline["level"] == expected_level
# Check if correct start_idx is extracted
start_idx = extracted_headline["start_idx"]
hl_len = len(extracted_headline["headline"])
assert extracted_headline["headline"] == document.content[start_idx : start_idx + hl_len]
@pytest.mark.unit
def test_markdown_converter_frontmatter_to_meta():
converter = MarkdownConverter(add_frontmatter_to_meta=True)
document = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")[0]
assert document.meta["type"] == "intro"
assert document.meta["date"] == "1.1.2023"
@pytest.mark.unit
def test_markdown_converter_remove_code_snippets():
converter = MarkdownConverter(remove_code_snippets=False)
document = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")[0]
assert document.content.startswith("pip install farm-haystack")
def test_azure_converter():
    # Check whether the Form Recognizer endpoint and credential key are set in the environment.
if "AZURE_FORMRECOGNIZER_ENDPOINT" in os.environ and "AZURE_FORMRECOGNIZER_KEY" in os.environ:
converter = AzureConverter(
endpoint=os.environ["AZURE_FORMRECOGNIZER_ENDPOINT"],
credential_key=os.environ["AZURE_FORMRECOGNIZER_KEY"],
save_json=True,
)
docs = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
assert len(docs) == 2
assert docs[0].content_type == "table"
assert docs[0].content.shape[0] == 4 # number of rows
assert docs[0].content.shape[1] == 5 # number of columns, Form Recognizer assumes there are 5 columns
assert list(docs[0].content.columns) == ["", "Column 1", "", "Column 2", "Column 3"]
assert list(docs[0].content.iloc[3]) == ["D", "$54.35", "", "$6345.", ""]
assert (
docs[0].meta["preceding_context"] == "specification. These proprietary technologies are not "
"standardized and their\nspecification is published only on "
"Adobe's website. Many of them are also not\nsupported by "
"popular third-party implementations of PDF."
)
assert docs[0].meta["following_context"] == ""
assert docs[0].meta["page"] == 1
assert docs[1].content_type == "text"
assert docs[1].content.startswith("A sample PDF file")


@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Parsr not running on Windows CI")
def test_parsr_converter():
converter = ParsrConverter()
docs = converter.convert(file_path=str((SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf").absolute()))
assert len(docs) == 2
assert docs[0].content_type == "table"
assert docs[0].content.shape[0] == 4 # number of rows
assert docs[0].content.shape[1] == 4
assert list(docs[0].content.columns) == ["", "Column 1", "Column 2", "Column 3"]
assert list(docs[0].content.iloc[3]) == ["D", "$54.35", "$6345.", ""]
assert (
docs[0].meta["preceding_context"] == "specification. These proprietary technologies are not "
"standardized and their\nspecification is published only on "
"Adobe's website. Many of them are also not\nsupported by popular "
"third-party implementations of PDF."
)
assert docs[0].meta["following_context"] == ""
assert docs[0].meta["page"] == 1
assert docs[1].content_type == "text"
assert docs[1].content.startswith("A sample PDF file")
assert docs[1].content.endswith("Page 4 of Sample PDF\n… the page 3 is empty.")
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Parsr not running on Windows CI")
def test_parsr_converter_headline_extraction():
expected_headlines = [
[("Lorem ipsum", 1), ("Cras fringilla ipsum magna, in fringilla dui commodo\na.", 2)],
[
("Lorem ipsum", 1),
("Lorem ipsum dolor sit amet, consectetur adipiscing\nelit. Nunc ac faucibus odio.", 2),
("Cras fringilla ipsum magna, in fringilla dui commodo\na.", 2),
("Lorem ipsum dolor sit amet, consectetur adipiscing\nelit.", 2),
("Maecenas mauris lectus, lobortis et purus mattis, blandit\ndictum tellus.", 2),
("In eleifend velit vitae libero sollicitudin euismod.", 2),
],
]
converter = ParsrConverter()
docs = converter.convert(file_path=str((SAMPLES_PATH / "pdf" / "sample_pdf_4.pdf").absolute()))
assert len(docs) == 2
for doc, expectation in zip(docs, expected_headlines):
for extracted_headline, (expected_headline, expected_level) in zip(doc.meta["headlines"], expectation):
# Check if correct headline and level is extracted
assert extracted_headline["headline"] == expected_headline
assert extracted_headline["level"] == expected_level
# Check if correct start_idx is extracted
if doc.content_type == "text":
start_idx = extracted_headline["start_idx"]
hl_len = len(extracted_headline["headline"])
assert extracted_headline["headline"] == doc.content[start_idx : start_idx + hl_len]
@pytest.mark.unit
def test_id_hash_keys_from_pipeline_params():
doc_path = SAMPLES_PATH / "docs" / "doc_1.txt"
meta_1 = {"key": "a"}
meta_2 = {"key": "b"}
meta = [meta_1, meta_2]
converter = TextConverter()
output, _ = converter.run(file_paths=[doc_path, doc_path], meta=meta, id_hash_keys=["content", "meta"])
documents = output["documents"]
unique_ids = set(d.id for d in documents)
assert len(documents) == 2
assert len(unique_ids) == 2
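

# A minimal sketch (illustrative only, not a test) of what `id_hash_keys=["content", "meta"]`
# buys us: identical content with different meta still hashes to distinct document ids.
def _example_id_hash_keys():
    doc_a = Document(content="same text", meta={"key": "a"}, id_hash_keys=["content", "meta"])
    doc_b = Document(content="same text", meta={"key": "b"}, id_hash_keys=["content", "meta"])
    assert doc_a.id != doc_b.id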


def write_as_csv(data: List[List[str]], file_path: Path):
with open(file_path, "w") as f:
writer = csv.writer(f)
writer.writerows(data)
@pytest.mark.unit
def test_csv_to_document_with_qa_headers(tmp_path):
node = CsvTextConverter()
csv_path = tmp_path / "csv_qa_with_headers.csv"
rows = [
["question", "answer"],
["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
]
write_as_csv(rows, csv_path)
output, edge = node.run(file_paths=csv_path)
assert edge == "output_1"
assert "documents" in output
assert len(output["documents"]) == 1
doc = output["documents"][0]
assert isinstance(doc, Document)
assert doc.content == "What is Haystack ?"
assert doc.meta["answer"] == "Haystack is an NLP Framework to use transformers in your Applications."
@pytest.mark.unit
def test_csv_to_document_with_wrong_qa_headers(tmp_path):
node = CsvTextConverter()
csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
rows = [
["wrong", "headers"],
["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
]
write_as_csv(rows, csv_path)
with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
node.run(file_paths=csv_path)
@pytest.mark.unit
def test_csv_to_document_with_one_wrong_qa_headers(tmp_path):
node = CsvTextConverter()
csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
rows = [
["wrong", "answers"],
["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
]
write_as_csv(rows, csv_path)
with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
node.run(file_paths=csv_path)
@pytest.mark.unit
def test_csv_to_document_with_another_wrong_qa_headers(tmp_path):
node = CsvTextConverter()
csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
rows = [
["question", "wrong"],
["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
]
write_as_csv(rows, csv_path)
with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
node.run(file_paths=csv_path)
@pytest.mark.unit
def test_csv_to_document_with_one_column(tmp_path):
node = CsvTextConverter()
csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
rows = [["question"], ["What is Haystack ?"]]
write_as_csv(rows, csv_path)
with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
node.run(file_paths=csv_path)
@pytest.mark.unit
def test_csv_to_document_with_three_columns(tmp_path):
node = CsvTextConverter()
csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
rows = [
["question", "answer", "notes"],
["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications.", "verified"],
]
write_as_csv(rows, csv_path)
with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
node.run(file_paths=csv_path)
@pytest.mark.unit
def test_csv_to_document_many_files(tmp_path):
csv_paths = []
for i in range(5):
node = CsvTextConverter()
csv_path = tmp_path / f"{i}_csv_qa_with_headers.csv"
csv_paths.append(csv_path)
rows = [
["question", "answer"],
[
f"{i}. What is Haystack ?",
f"{i}. Haystack is an NLP Framework to use transformers in your Applications.",
],
]
write_as_csv(rows, csv_path)
output, edge = node.run(file_paths=csv_paths)
assert edge == "output_1"
assert "documents" in output
assert len(output["documents"]) == 5
for i in range(5):
doc = output["documents"][i]
assert isinstance(doc, Document)
assert doc.content == f"{i}. What is Haystack ?"
assert doc.meta["answer"] == f"{i}. Haystack is an NLP Framework to use transformers in your Applications."
@pytest.mark.unit
class TestJsonConverter:
JSON_FILE_NAME = "json_normal.json"
JSONL_FILE_NAME = "json_normal.jsonl"
JSON_SINGLE_LINE_FILE_NAME = "json_all_single.json"
JSONL_LIST_LINE_FILE_NAME = "json_list_line.jsonl"
JSON_INVALID = "json_invalid.json"
@classmethod
@pytest.fixture(autouse=True)
def setup_class(cls, tmp_path):
        # Set up the documents.
        # Note: We are tying the behavior of `JsonConverter`
        # to that of the `to_dict()` method on the `Document` class.
documents = [
Document(
content=pd.DataFrame(
[["C", "Yes", "No"], ["Haskell", "No", "No"], ["Python", "Yes", "Yes"]],
columns=["Language", "Imperative", "OO"],
),
content_type="table",
meta={"context": "Programming Languages", "page": 2},
),
Document(
content="Programming languages are used for controlling the behavior of a machine (often a computer).",
content_type="text",
meta={"context": "Programming Languages", "page": 1},
),
Document(
content=pd.DataFrame(
[["C", 1, 1], ["Python", 6, 6.5]], columns=["Language", "Statements ratio", "Line ratio"]
),
content_type="table",
meta={"context": "Expressiveness", "page": 3},
),
]
doc_dicts_list = [d.to_dict() for d in documents]
json_path = tmp_path / TestJsonConverter.JSON_FILE_NAME
with open(json_path, "w") as f:
json.dump(doc_dicts_list, f)
jsonl_path = tmp_path / TestJsonConverter.JSONL_FILE_NAME
with open(jsonl_path, "w") as f:
for doc in doc_dicts_list:
f.write(json.dumps(doc) + "\n")
        # JSON, but everything written on a single line
json_single_path = tmp_path / TestJsonConverter.JSON_SINGLE_LINE_FILE_NAME
with open(json_single_path, "w") as f:
f.write(json.dumps(doc_dicts_list))
        # Two lines (JSONL), but each line contains a list of dicts instead of a single dict
jsonl_list_line_path = tmp_path / TestJsonConverter.JSONL_LIST_LINE_FILE_NAME
with open(jsonl_list_line_path, "w") as f:
for doc in [doc_dicts_list[:2], doc_dicts_list[2:3]]:
f.write(json.dumps(doc) + "\n")
json_invalid_path = tmp_path / TestJsonConverter.JSON_INVALID
with open(json_invalid_path, "w") as f:
f.write("{an invalid json string}")
def _assert_docs_okay(self, docs):
# Two table docs and one text doc
# [table, text, table]
assert len(docs) == 3
assert all(doc.meta["topic"] == "programming" for doc in docs)
# "context" in metadata should have been overwritten to be "PL" instead of "Programming Languages"
assert all(doc.meta["context"] == "PL" for doc in docs)
assert all(d.content_type == expected for d, expected in zip(docs, ("table", "text", "table")))
# Text doc test
assert (
docs[1].content
== "Programming languages are used for controlling the behavior of a machine (often a computer)."
)
# Table doc tests
assert isinstance(docs[0].content, pd.DataFrame)
assert docs[0].content.shape == (3, 3)
assert isinstance(docs[2].content, pd.DataFrame)
assert docs[2].content.shape == (2, 3)
def test_json_to_documents(self, tmp_path):
json_path = tmp_path / TestJsonConverter.JSON_FILE_NAME
converter = JsonConverter()
docs = converter.convert(json_path, meta={"topic": "programming", "context": "PL"})
self._assert_docs_okay(docs)
def test_json_to_documents_single_line(self, tmp_path):
json_path = tmp_path / TestJsonConverter.JSON_SINGLE_LINE_FILE_NAME
converter = JsonConverter()
docs = converter.convert(json_path, meta={"topic": "programming", "context": "PL"})
self._assert_docs_okay(docs)
def test_jsonl_to_documents(self, tmp_path):
jsonl_path = tmp_path / TestJsonConverter.JSONL_FILE_NAME
converter = JsonConverter()
docs = converter.convert(jsonl_path, meta={"topic": "programming", "context": "PL"})
self._assert_docs_okay(docs)
def test_jsonl_to_documents_list_line(self, tmp_path):
jsonl_path = tmp_path / TestJsonConverter.JSONL_LIST_LINE_FILE_NAME
converter = JsonConverter()
docs = converter.convert(jsonl_path, meta={"topic": "programming", "context": "PL"})
self._assert_docs_okay(docs)
def test_json_invalid(self, tmp_path):
json_path = tmp_path / TestJsonConverter.JSON_INVALID
converter = JsonConverter()
with pytest.raises(json.JSONDecodeError) as excinfo:
converter.convert(json_path)
# Assert filename is in the error message
assert TestJsonConverter.JSON_INVALID in str(excinfo.value)