haystack/test/nodes/test_file_converter.py
Sara Zan 59608ca474
[CI Refactoring] Workflow refactoring (#2576)
* Unify CI tests (from #2466)

* Update Documentation & Code Style

* Change folder names

* Fix markers list

* Remove marker 'slow', replaced with 'integration'

* Soften children check

* Start ES first so it has time to boot while Python is setup

* Run the full workflow

* Try to make pip upgrade on Windows

* Set KG tests as integration

* Update Documentation & Code Style

* typo

* faster pylint

* Make Pylint use the cache

* filter diff files for pylint

* debug pylint statement

* revert pylint changes

* Remove path from asserted log (fails on Windows)

* Skip preprocessor test on Windows

* Tackling Windows specific failures

* Fix pytest command for windows suites

* Remove \ from command

* Move poppler test into integration

* Skip opensearch test on windows

* Add tolerance in reader sas score for Windows

* Another pytorch approx

* Raise time limit for unit tests :(

* Skip poppler test on Windows CI

* Specify to pull with FF only in docs check

* temporarily run the docs check immediately

* Allow merge commit for now

* Try without fetch depth

* Accelerating test

* Accelerating test

* Add repository and ref alongside fetch-depth

* Separate out code&docs check from tests

* Use setup-python cache

* Delete custom action

* Remove the pull step in the docs check, will find a way to run on bot commits

* Add requirements.txt in .github for caching

* Actually install dependencies

* Change deps group for pylint

* Unclear why the requirements.txt is still required :/

* Fix the code check python setup

* Install all deps for pylint

* Make the autoformat check depend on tests and doc updates workflows

* Try installing dependencies in another order

* Try again to install the deps

* quoting the paths

* Ad back the requirements

* Try again to install rest_api and ui

* Change deps group

* Duplicate haystack install line

* See if the cache is the problem

* Disable also in mypy, who knows

* split the install step

* Split install step everywhere

* Revert "Separate out code&docs check from tests"

This reverts commit 1cd59b15ffc5b984e1d642dcbf4c8ccc2bb6c9bd.

* Add back the action

* Proactive support for audio (see text2speech branch)

* Fix label generator tests

* Remove install of libsndfile1 on win temporarily

* exclude audio tests on win

* install ffmpeg for integration tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
2022-06-07 09:23:03 +02:00

175 lines
7.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import sys
from pathlib import Path
import subprocess
import pytest
from haystack.nodes import (
MarkdownConverter,
DocxToTextConverter,
PDFToTextConverter,
PDFToTextOCRConverter,
TikaConverter,
AzureConverter,
ParsrConverter,
)
from ..conftest import SAMPLES_PATH
@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter, PDFToTextOCRConverter])
def test_convert(Converter):
converter = Converter()
document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")[0]["documents"][0]
pages = document.content.split("\f")
assert (
len(pages) != 1 and pages[0] != ""
), f'{type(converter).__name__} did return a single empty page indicating a potential issue with your installed poppler version. Try installing via "conda install -c conda-forge poppler" and check test_pdftoppm_command_format()'
assert len(pages) == 4 # the sample PDF file has four pages.
assert pages[0] != "" # the page 1 of PDF contains text.
assert pages[2] == "" # the page 3 of PDF file is empty.
# assert text is retained from the document.
# As whitespace can differ (\n," ", etc.), we standardize all to simple whitespace
page_standard_whitespace = " ".join(pages[0].split())
assert "Adobe Systems made the PDF specification available free of charge in 1993." in page_standard_whitespace
# Marked as integration because it uses poppler, which is not installed in the unit tests suite
@pytest.mark.integration
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Poppler not installed on Windows CI")
def test_pdftoppm_command_format():
# Haystack's PDFToTextOCRConverter uses pdf2image, which calls pdftoppm internally.
# Some installations of pdftoppm are incompatible with Haystack and won't raise an error but just return empty converted documents
# This test runs pdftoppm directly to check whether pdftoppm accepts the command format that pdf2image uses in Haystack
proc = subprocess.Popen(
["pdftoppm", f"{SAMPLES_PATH}/pdf/sample_pdf_1.pdf"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
out, err = proc.communicate()
# If usage info of pdftoppm is sent to stderr then it's because Haystack's pdf2image uses an incompatible command format
assert (
not err
), 'Your installation of poppler is incompatible with Haystack. Try installing via "conda install -c conda-forge poppler"'
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_encoding(Converter):
converter = Converter()
document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
assert "ɪ" in document.content
document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf", encoding="Latin1")[0]["documents"][0]
assert "ɪ" not in document.content
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_ligatures(Converter):
converter = Converter()
document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
assert "" not in document.content
assert "ɪ" in document.content
document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf", known_ligatures={})[0]["documents"][
0
]
assert "" in document.content
assert "ɪ" in document.content
document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf", known_ligatures={"ɪ": "i"})[0][
"documents"
][0]
assert "" in document.content
assert "ɪ" not in document.content
@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_table_removal(Converter):
converter = Converter(remove_numeric_tables=True)
document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")[0]
pages = document.content.split("\f")
# assert numeric rows are removed from the table.
assert "324" not in pages[0]
assert "54x growth" not in pages[0]
@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_language_validation(Converter, caplog):
converter = Converter(valid_languages=["en"])
converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
assert "sample_pdf_1.pdf is not one of ['en']." not in caplog.text
converter = Converter(valid_languages=["de"])
converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
assert "sample_pdf_1.pdf is not one of ['de']." in caplog.text
def test_docx_converter():
converter = DocxToTextConverter()
document = converter.convert(file_path=SAMPLES_PATH / "docx" / "sample_docx.docx")[0]
assert document.content.startswith("Sample Docx File")
def test_markdown_converter():
converter = MarkdownConverter()
document = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")[0]
assert document.content.startswith("What to build with Haystack")
def test_azure_converter():
# Check if Form Recognizer endpoint and credential key in environment variables
if "AZURE_FORMRECOGNIZER_ENDPOINT" in os.environ and "AZURE_FORMRECOGNIZER_KEY" in os.environ:
converter = AzureConverter(
endpoint=os.environ["AZURE_FORMRECOGNIZER_ENDPOINT"],
credential_key=os.environ["AZURE_FORMRECOGNIZER_KEY"],
save_json=True,
)
docs = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
assert len(docs) == 2
assert docs[0].content_type == "table"
assert docs[0].content.shape[0] == 4 # number of rows
assert docs[0].content.shape[1] == 5 # number of columns, Form Recognizer assumes there are 5 columns
assert list(docs[0].content.columns) == ["", "Column 1", "", "Column 2", "Column 3"]
assert list(docs[0].content.iloc[3]) == ["D", "$54.35", "", "$6345.", ""]
assert (
docs[0].meta["preceding_context"] == "specification. These proprietary technologies are not "
"standardized and their\nspecification is published only on "
"Adobe's website. Many of them are also not\nsupported by "
"popular third-party implementations of PDF."
)
assert docs[0].meta["following_context"] == ""
assert docs[1].content_type == "text"
assert docs[1].content.startswith("A sample PDF file")
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Parsr not running on Windows CI")
def test_parsr_converter():
converter = ParsrConverter()
docs = converter.convert(file_path=str((SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf").absolute()))
assert len(docs) == 2
assert docs[0].content_type == "table"
assert docs[0].content.shape[0] == 4 # number of rows
assert docs[0].content.shape[1] == 4
assert list(docs[0].content.columns) == ["", "Column 1", "Column 2", "Column 3"]
assert list(docs[0].content.iloc[3]) == ["D", "$54.35", "$6345.", ""]
assert (
docs[0].meta["preceding_context"] == "specification. These proprietary technologies are not "
"standardized and their\nspecification is published only on "
"Adobe's website. Many of them are also not\nsupported by popular "
"third-party implementations of PDF."
)
assert docs[0].meta["following_context"] == ""
assert docs[1].content_type == "text"
assert docs[1].content.startswith("A sample PDF file")
assert docs[1].content.endswith("Page 4 of Sample PDF\n… the page 3 is empty.")