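"""Tests for Haystack's file converter nodes: PDF text extraction (pdftotext,
Tika, OCR), Docx, Markdown, Azure Form Recognizer, Parsr, plain text, CSV
question/answer files, and JSON/JSONL documents."""
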
import csv
import json
import os
import subprocess
import sys
import warnings
from pathlib import Path
from typing import List
from unittest.mock import patch

import pandas as pd
import pytest

from haystack import Document
from haystack.nodes import (
    AzureConverter,
    CsvTextConverter,
    DocxToTextConverter,
    JsonConverter,
    MarkdownConverter,
    ParsrConverter,
    PDFToTextConverter,
    PDFToTextOCRConverter,
    PreProcessor,
    TextConverter,
    TikaConverter,
)

from ..conftest import SAMPLES_PATH, fail_at_version


@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter, PDFToTextOCRConverter])
def test_convert(Converter):
    converter = Converter()
    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")[0]["documents"][0]
    pages = document.content.split("\f")

    assert (
        len(pages) != 1 and pages[0] != ""
    ), f'{type(converter).__name__} returned a single empty page, indicating a potential issue with your installed poppler version. Try installing via "conda install -c conda-forge poppler" and check test_pdftoppm_command_format()'

    assert len(pages) == 4  # the sample PDF file has four pages
    assert pages[0] != ""  # page 1 of the PDF contains text
    assert pages[2] == ""  # page 3 of the PDF is empty
    # Assert that text is retained from the document.
    # As whitespace can differ (\n, " ", etc.), we standardize all of it to simple spaces.
    page_standard_whitespace = " ".join(pages[0].split())
    assert "Adobe Systems made the PDF specification available free of charge in 1993." in page_standard_whitespace


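# Note: `run()` is the pipeline entry point of a converter node and returns a
# `(output, edge)` tuple with the documents under `output["documents"]`, while
# `convert()` returns the list of documents directly. Both call styles are
# exercised throughout this module.

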
# Marked as integration because it uses poppler, which is not installed in the unit test suite
@pytest.mark.integration
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Poppler not installed on Windows CI")
def test_pdftoppm_command_format():
    # Haystack's PDFToTextOCRConverter uses pdf2image, which calls pdftoppm internally.
    # Some installations of pdftoppm are incompatible with Haystack: they don't raise an error but just return empty converted documents.
    # This test runs pdftoppm directly to check whether it accepts the command format that pdf2image uses in Haystack.
    proc = subprocess.Popen(
        ["pdftoppm", f"{SAMPLES_PATH}/pdf/sample_pdf_1.pdf"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    out, err = proc.communicate()
    # If pdftoppm sends its usage info to stderr, Haystack's pdf2image is using an incompatible command format
    assert (
        not err
    ), 'Your installation of poppler is incompatible with Haystack. Try installing via "conda install -c conda-forge poppler"'


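# If the assertion above fails, a quick way to inspect the local poppler build
# is to ask pdftoppm for its version banner, which it prints on stderr
# (a sketch, kept commented out because it is not part of the test suite):
#
#     proc = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True)
#     print(proc.stderr)  # e.g. "pdftoppm version 22.02.0"

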
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_command_whitespaces(Converter):
    converter = Converter()

    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample pdf file with spaces on file name.pdf")[0][
        "documents"
    ][0]
    assert "ɪ" in document.content


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_encoding(Converter):
    converter = Converter()

    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_5.pdf")[0]["documents"][0]
    assert "Ж" in document.content

    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
    assert "ɪ" in document.content


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_sort_by_position(Converter):
    converter = Converter(sort_by_position=True)

    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_3.pdf")[0]
    assert str(document.content).startswith("This is the second test sentence.")


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_ligatures(Converter):
    converter = Converter()

    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
    assert "ﬀ" not in document.content
    assert "ɪ" in document.content

    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf", known_ligatures={})[0]["documents"][
        0
    ]
    assert "ﬀ" in document.content
    assert "ɪ" in document.content

    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf", known_ligatures={"ɪ": "i"})[0][
        "documents"
    ][0]
    assert "ﬀ" in document.content
    assert "ɪ" not in document.content


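# `known_ligatures` maps a ligature codepoint to its expansion; passing `{}`
# disables the normalization entirely, as the test above shows. A custom
# mapping would look like (hypothetical values):
#
#     converter.run(file_paths=..., known_ligatures={"ﬁ": "fi"})

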
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_page_range(Converter):
    converter = Converter()
    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf", start_page=2)[0]
    pages = document.content.split("\f")

    assert (
        len(pages) == 4
    )  # the sample PDF has four pages; we skipped the first, but the total page count should stay correct
    assert pages[0] == ""  # page 1 was skipped
    assert pages[1] != ""  # page 2 is not empty
    assert pages[2] == ""  # page 3 is empty


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_page_range_numbers(Converter):
    converter = Converter()
    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf", start_page=2)[0]

    preprocessor = PreProcessor(
        split_by="word", split_length=5, split_overlap=0, split_respect_sentence_boundary=False, add_page_number=True
    )
    documents = preprocessor.process([document])

    assert documents[1].meta["page"] == 4


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel(Converter):
    converter = Converter(multiprocessing=True)
    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_6.pdf")[0]

    pages = document.content.split("\f")

    assert pages[0] == "This is the page 1 of the document."
    assert pages[-1] == "This is the page 50 of the document."


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel_page_range(Converter):
    converter = Converter(multiprocessing=True)
    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_6.pdf", start_page=2)[0]

    pages = document.content.split("\f")

    assert pages[0] == ""
    assert len(pages) == 50


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel_sort_by_position(Converter):
    converter = Converter(multiprocessing=True, sort_by_position=True)
    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_6.pdf")[0]

    pages = document.content.split("\f")

    assert pages[0] == "This is the page 1 of the document."
    assert pages[-1] == "This is the page 50 of the document."


@pytest.mark.integration
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel_ocr(Converter):
    converter = Converter(multiprocessing=True, sort_by_position=True, ocr="full", ocr_language="eng")
    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_6.pdf")[0]

    pages = document.content.split("\f")

    assert pages[0] == "This is the page 1 of the document."
    assert pages[-1] == "This is the page 50 of the document."


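# The tests below guard deprecated parameters. `fail_at_version(1, 17)` (from
# conftest) presumably makes a test fail outright once the installed Haystack
# version reaches the given release, so the deprecated code paths cannot
# outlive their announced removal.

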
@pytest.mark.unit
@fail_at_version(1, 17)
@patch("haystack.nodes.file_converter.image.ImageToTextConverter.__new__")
def test_deprecated_ocr_node(mock):
    with pytest.warns(DeprecationWarning):
        PDFToTextOCRConverter()


@fail_at_version(1, 17)
def test_deprecated_encoding():
    with pytest.warns(DeprecationWarning):
        converter = PDFToTextConverter(encoding="utf-8")


@fail_at_version(1, 17)
def test_deprecated_encoding_in_convert_method():
    converter = PDFToTextConverter()
    with pytest.warns(DeprecationWarning):
        converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf", encoding="utf-8")


@fail_at_version(1, 17)
def test_deprecated_keep_physical_layout():
    with pytest.warns(DeprecationWarning):
        converter = PDFToTextConverter(keep_physical_layout=True)


@fail_at_version(1, 17)
def test_deprecated_keep_physical_layout_in_convert_method():
    converter = PDFToTextConverter()
    with pytest.warns(DeprecationWarning):
        converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf", keep_physical_layout=True)


@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_table_removal(Converter):
    converter = Converter(remove_numeric_tables=True)
    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")[0]
    pages = document.content.split("\f")
    # assert numeric rows are removed from the table
    assert "324" not in pages[0]
    assert "54x growth" not in pages[0]


@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_language_validation(Converter, caplog):
    converter = Converter(valid_languages=["en"])
    converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
    assert "sample_pdf_1.pdf is not one of ['en']." not in caplog.text

    converter = Converter(valid_languages=["de"])
    converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
    assert "sample_pdf_1.pdf is not one of ['de']." in caplog.text


@pytest.mark.unit
def test_docx_converter():
    converter = DocxToTextConverter()
    document = converter.convert(file_path=SAMPLES_PATH / "docx" / "sample_docx.docx")[0]
    assert document.content.startswith("Sample Docx File")


@pytest.mark.unit
def test_markdown_converter():
    converter = MarkdownConverter()
    document = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")[0]
    assert document.content.startswith("\nWhat to build with Haystack")
    assert "# git clone https://github.com/deepset-ai/haystack.git" not in document.content


@pytest.mark.unit
def test_markdown_converter_headline_extraction():
    expected_headlines = [
        ("What to build with Haystack", 1),
        ("Core Features", 1),
        ("Quick Demo", 1),
        ("2nd level headline for testing purposes", 2),
        ("3rd level headline for testing purposes", 3),
    ]

    converter = MarkdownConverter(extract_headlines=True, remove_code_snippets=False)
    document = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")[0]

    # Check that the correct number of headlines is extracted
    assert len(document.meta["headlines"]) == 5
    for extracted_headline, (expected_headline, expected_level) in zip(document.meta["headlines"], expected_headlines):
        # Check that the correct headline and level are extracted
        assert extracted_headline["headline"] == expected_headline
        assert extracted_headline["level"] == expected_level

        # Check that the correct start_idx is extracted
        start_idx = extracted_headline["start_idx"]
        hl_len = len(extracted_headline["headline"])
        assert extracted_headline["headline"] == document.content[start_idx : start_idx + hl_len]


@pytest.mark.unit
def test_markdown_converter_frontmatter_to_meta():
    converter = MarkdownConverter(add_frontmatter_to_meta=True)
    document = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")[0]
    assert document.meta["type"] == "intro"
    assert document.meta["date"] == "1.1.2023"


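# For reference, the frontmatter parsed above sits at the top of sample.md,
# presumably delimited in the usual way:
#
#     ---
#     type: intro
#     date: 1.1.2023
#     ---

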
@pytest.mark.unit
def test_markdown_converter_remove_code_snippets():
    converter = MarkdownConverter(remove_code_snippets=False)
    document = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")[0]
    assert document.content.startswith("pip install farm-haystack")


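# test_azure_converter only runs when Form Recognizer credentials are present
# in the environment, e.g. (placeholder values):
#
#     export AZURE_FORMRECOGNIZER_ENDPOINT="https://<resource>.cognitiveservices.azure.com/"
#     export AZURE_FORMRECOGNIZER_KEY="<key>"

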
def test_azure_converter():
    # Check whether a Form Recognizer endpoint and credential key are set in the environment
    if "AZURE_FORMRECOGNIZER_ENDPOINT" in os.environ and "AZURE_FORMRECOGNIZER_KEY" in os.environ:
        converter = AzureConverter(
            endpoint=os.environ["AZURE_FORMRECOGNIZER_ENDPOINT"],
            credential_key=os.environ["AZURE_FORMRECOGNIZER_KEY"],
            save_json=True,
        )

        docs = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
        assert len(docs) == 2
        assert docs[0].content_type == "table"
        assert docs[0].content.shape[0] == 4  # number of rows
        assert docs[0].content.shape[1] == 5  # number of columns; Form Recognizer assumes there are 5 columns
        assert list(docs[0].content.columns) == ["", "Column 1", "", "Column 2", "Column 3"]
        assert list(docs[0].content.iloc[3]) == ["D", "$54.35", "", "$6345.", ""]
        assert (
            docs[0].meta["preceding_context"] == "specification. These proprietary technologies are not "
            "standardized and their\nspecification is published only on "
            "Adobe's website. Many of them are also not\nsupported by "
            "popular third-party implementations of PDF."
        )
        assert docs[0].meta["following_context"] == ""
        assert docs[0].meta["page"] == 1

        assert docs[1].content_type == "text"
        assert docs[1].content.startswith("A sample PDF file")


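# ParsrConverter talks to a running Parsr instance (typically started via the
# axa-group/Parsr Docker image), which is why the two tests below are skipped
# on the Windows CI where that service is not available.

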
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Parsr not running on Windows CI")
def test_parsr_converter():
    converter = ParsrConverter()

    docs = converter.convert(file_path=str((SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf").absolute()))
    assert len(docs) == 2
    assert docs[0].content_type == "table"
    assert docs[0].content.shape[0] == 4  # number of rows
    assert docs[0].content.shape[1] == 4  # number of columns
    assert list(docs[0].content.columns) == ["", "Column 1", "Column 2", "Column 3"]
    assert list(docs[0].content.iloc[3]) == ["D", "$54.35", "$6345.", ""]
    assert (
        docs[0].meta["preceding_context"] == "specification. These proprietary technologies are not "
        "standardized and their\nspecification is published only on "
        "Adobe's website. Many of them are also not\nsupported by popular "
        "third-party implementations of PDF."
    )
    assert docs[0].meta["following_context"] == ""
    assert docs[0].meta["page"] == 1

    assert docs[1].content_type == "text"
    assert docs[1].content.startswith("A sample PDF file")
    assert docs[1].content.endswith("Page 4 of Sample PDF\n… the page 3 is empty.")


@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Parsr not running on Windows CI")
def test_parsr_converter_headline_extraction():
    expected_headlines = [
        [("Lorem ipsum", 1), ("Cras fringilla ipsum magna, in fringilla dui commodo\na.", 2)],
        [
            ("Lorem ipsum", 1),
            ("Lorem ipsum dolor sit amet, consectetur adipiscing\nelit. Nunc ac faucibus odio.", 2),
            ("Cras fringilla ipsum magna, in fringilla dui commodo\na.", 2),
            ("Lorem ipsum dolor sit amet, consectetur adipiscing\nelit.", 2),
            ("Maecenas mauris lectus, lobortis et purus mattis, blandit\ndictum tellus.", 2),
            ("In eleifend velit vitae libero sollicitudin euismod.", 2),
        ],
    ]

    converter = ParsrConverter()

    docs = converter.convert(file_path=str((SAMPLES_PATH / "pdf" / "sample_pdf_4.pdf").absolute()))
    assert len(docs) == 2

    for doc, expectation in zip(docs, expected_headlines):
        for extracted_headline, (expected_headline, expected_level) in zip(doc.meta["headlines"], expectation):
            # Check that the correct headline and level are extracted
            assert extracted_headline["headline"] == expected_headline
            assert extracted_headline["level"] == expected_level

            # Check that the correct start_idx is extracted
            if doc.content_type == "text":
                start_idx = extracted_headline["start_idx"]
                hl_len = len(extracted_headline["headline"])
                assert extracted_headline["headline"] == doc.content[start_idx : start_idx + hl_len]


@pytest.mark.unit
def test_id_hash_keys_from_pipeline_params():
    doc_path = SAMPLES_PATH / "docs" / "doc_1.txt"
    meta_1 = {"key": "a"}
    meta_2 = {"key": "b"}
    meta = [meta_1, meta_2]

    converter = TextConverter()
    output, _ = converter.run(file_paths=[doc_path, doc_path], meta=meta, id_hash_keys=["content", "meta"])
    documents = output["documents"]
    unique_ids = set(d.id for d in documents)

    assert len(documents) == 2
    assert len(unique_ids) == 2


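# With `id_hash_keys=["content", "meta"]`, the document id is derived from both
# fields, so converting the same file twice with different meta yields two
# distinct ids, which is exactly what the test above asserts.

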
def write_as_csv(data: List[List[str]], file_path: Path):
    with open(file_path, "w") as f:
        writer = csv.writer(f)
        writer.writerows(data)


@pytest.mark.unit
def test_csv_to_document_with_qa_headers(tmp_path):
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_headers.csv"
    rows = [
        ["question", "answer"],
        ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
    ]
    write_as_csv(rows, csv_path)

    output, edge = node.run(file_paths=csv_path)
    assert edge == "output_1"
    assert "documents" in output
    assert len(output["documents"]) == 1

    doc = output["documents"][0]
    assert isinstance(doc, Document)
    assert doc.content == "What is Haystack ?"
    assert doc.meta["answer"] == "Haystack is an NLP Framework to use transformers in your Applications."


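# The CsvTextConverter tests below all build on the same contract: the CSV must
# contain exactly two columns with a "question"/"answer" header row, e.g.:
#
#     question,answer
#     What is Haystack ?,Haystack is an NLP Framework to use transformers in your Applications.
#
# Anything else raises a ValueError.

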
@pytest.mark.unit
def test_csv_to_document_with_wrong_qa_headers(tmp_path):
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    rows = [
        ["wrong", "headers"],
        ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
    ]
    write_as_csv(rows, csv_path)

    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)


@pytest.mark.unit
def test_csv_to_document_with_one_wrong_qa_headers(tmp_path):
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    rows = [
        ["wrong", "answers"],
        ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
    ]
    write_as_csv(rows, csv_path)

    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)


@pytest.mark.unit
def test_csv_to_document_with_another_wrong_qa_headers(tmp_path):
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    rows = [
        ["question", "wrong"],
        ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
    ]
    write_as_csv(rows, csv_path)

    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)


@pytest.mark.unit
def test_csv_to_document_with_one_column(tmp_path):
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    rows = [["question"], ["What is Haystack ?"]]
    write_as_csv(rows, csv_path)

    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)


@pytest.mark.unit
def test_csv_to_document_with_three_columns(tmp_path):
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    rows = [
        ["question", "answer", "notes"],
        ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications.", "verified"],
    ]
    write_as_csv(rows, csv_path)

    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)


@pytest.mark.unit
def test_csv_to_document_many_files(tmp_path):
    csv_paths = []
    for i in range(5):
        node = CsvTextConverter()
        csv_path = tmp_path / f"{i}_csv_qa_with_headers.csv"
        csv_paths.append(csv_path)
        rows = [
            ["question", "answer"],
            [
                f"{i}. What is Haystack ?",
                f"{i}. Haystack is an NLP Framework to use transformers in your Applications.",
            ],
        ]
        write_as_csv(rows, csv_path)

    output, edge = node.run(file_paths=csv_paths)
    assert edge == "output_1"
    assert "documents" in output
    assert len(output["documents"]) == 5

    for i in range(5):
        doc = output["documents"][i]
        assert isinstance(doc, Document)
        assert doc.content == f"{i}. What is Haystack ?"
        assert doc.meta["answer"] == f"{i}. Haystack is an NLP Framework to use transformers in your Applications."


@pytest.mark.unit
class TestJsonConverter:
    JSON_FILE_NAME = "json_normal.json"
    JSONL_FILE_NAME = "json_normal.jsonl"
    JSON_SINGLE_LINE_FILE_NAME = "json_all_single.json"
    JSONL_LIST_LINE_FILE_NAME = "json_list_line.jsonl"
    JSON_INVALID = "json_invalid.json"

    @classmethod
    @pytest.fixture(autouse=True)
    def setup_class(cls, tmp_path):
        # Set up the documents.
        # Note: We are tying the behavior of `JsonConverter`
        # to that of the `to_dict()` method on the `Document`.
        documents = [
            Document(
                content=pd.DataFrame(
                    [["C", "Yes", "No"], ["Haskell", "No", "No"], ["Python", "Yes", "Yes"]],
                    columns=["Language", "Imperative", "OO"],
                ),
                content_type="table",
                meta={"context": "Programming Languages", "page": 2},
            ),
            Document(
                content="Programming languages are used for controlling the behavior of a machine (often a computer).",
                content_type="text",
                meta={"context": "Programming Languages", "page": 1},
            ),
            Document(
                content=pd.DataFrame(
                    [["C", 1, 1], ["Python", 6, 6.5]], columns=["Language", "Statements ratio", "Line ratio"]
                ),
                content_type="table",
                meta={"context": "Expressiveness", "page": 3},
            ),
        ]

        doc_dicts_list = [d.to_dict() for d in documents]

        json_path = tmp_path / TestJsonConverter.JSON_FILE_NAME
        with open(json_path, "w") as f:
            json.dump(doc_dicts_list, f)

        jsonl_path = tmp_path / TestJsonConverter.JSONL_FILE_NAME
        with open(jsonl_path, "w") as f:
            for doc in doc_dicts_list:
                f.write(json.dumps(doc) + "\n")

        # JSON, but everything written on a single line
        json_single_path = tmp_path / TestJsonConverter.JSON_SINGLE_LINE_FILE_NAME
        with open(json_single_path, "w") as f:
            f.write(json.dumps(doc_dicts_list))

        # Two lines (JSONL), but each line contains a list of dicts instead of a single dict
        jsonl_list_line_path = tmp_path / TestJsonConverter.JSONL_LIST_LINE_FILE_NAME
        with open(jsonl_list_line_path, "w") as f:
            for doc in [doc_dicts_list[:2], doc_dicts_list[2:3]]:
                f.write(json.dumps(doc) + "\n")

        json_invalid_path = tmp_path / TestJsonConverter.JSON_INVALID
        with open(json_invalid_path, "w") as f:
            f.write("{an invalid json string}")

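    # For reference, each serialized entry written above is the output of
    # `Document.to_dict()`, i.e. a plain dict roughly of this shape
    # (fields abridged; the exact keys come from the `Document` class):
    #
    #     {"content": ..., "content_type": "table", "meta": {...}, "id": ...}
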
    def _assert_docs_okay(self, docs):
        # Two table docs and one text doc, in the order [table, text, table]
        assert len(docs) == 3
        assert all(doc.meta["topic"] == "programming" for doc in docs)
        # "context" in the metadata should have been overwritten to "PL" instead of "Programming Languages"
        assert all(doc.meta["context"] == "PL" for doc in docs)
        assert all(d.content_type == expected for d, expected in zip(docs, ("table", "text", "table")))

        # Text doc test
        assert (
            docs[1].content
            == "Programming languages are used for controlling the behavior of a machine (often a computer)."
        )

        # Table doc tests
        assert isinstance(docs[0].content, pd.DataFrame)
        assert docs[0].content.shape == (3, 3)

        assert isinstance(docs[2].content, pd.DataFrame)
        assert docs[2].content.shape == (2, 3)

    def test_json_to_documents(self, tmp_path):
        json_path = tmp_path / TestJsonConverter.JSON_FILE_NAME

        converter = JsonConverter()
        docs = converter.convert(json_path, meta={"topic": "programming", "context": "PL"})

        self._assert_docs_okay(docs)

    def test_json_to_documents_single_line(self, tmp_path):
        json_path = tmp_path / TestJsonConverter.JSON_SINGLE_LINE_FILE_NAME

        converter = JsonConverter()
        docs = converter.convert(json_path, meta={"topic": "programming", "context": "PL"})

        self._assert_docs_okay(docs)

    def test_jsonl_to_documents(self, tmp_path):
        jsonl_path = tmp_path / TestJsonConverter.JSONL_FILE_NAME

        converter = JsonConverter()
        docs = converter.convert(jsonl_path, meta={"topic": "programming", "context": "PL"})

        self._assert_docs_okay(docs)

    def test_jsonl_to_documents_list_line(self, tmp_path):
        jsonl_path = tmp_path / TestJsonConverter.JSONL_LIST_LINE_FILE_NAME

        converter = JsonConverter()
        docs = converter.convert(jsonl_path, meta={"topic": "programming", "context": "PL"})

        self._assert_docs_okay(docs)

    def test_json_invalid(self, tmp_path):
        json_path = tmp_path / TestJsonConverter.JSON_INVALID

        converter = JsonConverter()
        with pytest.raises(json.JSONDecodeError) as excinfo:
            converter.convert(json_path)

        # Assert that the filename is in the error message
        assert TestJsonConverter.JSON_INVALID in str(excinfo.value)