2023-01-23 15:56:36 +01:00
from typing import List
2021-11-29 18:44:20 +01:00
import os
2022-06-07 09:23:03 +02:00
import sys
from pathlib import Path
2022-05-24 11:31:32 +02:00
import subprocess
2023-01-23 15:56:36 +01:00
import csv
2023-02-21 09:23:42 +01:00
import json
2020-06-08 11:07:19 +02:00
2023-02-21 09:23:42 +01:00
import pandas as pd
2020-08-17 11:21:09 +02:00
import pytest
2020-08-17 11:13:52 +02:00
2023-01-23 15:56:36 +01:00
from haystack import Document
2022-02-03 13:43:18 +01:00
from haystack . nodes import (
MarkdownConverter ,
DocxToTextConverter ,
PDFToTextConverter ,
PDFToTextOCRConverter ,
TikaConverter ,
AzureConverter ,
ParsrConverter ,
2022-06-24 09:55:09 +02:00
TextConverter ,
2023-01-23 15:56:36 +01:00
CsvTextConverter ,
2023-02-21 09:23:42 +01:00
JsonConverter ,
2023-01-30 10:09:22 -03:00
PreProcessor ,
2022-02-03 13:43:18 +01:00
)
2020-06-08 11:07:19 +02:00
2023-03-01 18:34:38 -03:00
from . . conftest import SAMPLES_PATH , fail_at_version
2022-01-26 18:12:55 +01:00
2020-10-30 18:06:02 +01:00
@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter, PDFToTextOCRConverter])
def test_convert(Converter):
    """Every PDF converter must extract all four pages of the sample PDF."""
    converter = Converter()
    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")[0]["documents"][0]
    pages = document.content.split("\f")
    assert (
        len(pages) != 1 and pages[0] != ""
    ), f'{type(converter).__name__} did return a single empty page indicating a potential issue with your installed poppler version. Try installing via "conda install -c conda-forge poppler" and check test_pdftoppm_command_format()'
    assert len(pages) == 4  # the sample PDF file has four pages.
    assert pages[0] != ""  # page 1 of the PDF contains text.
    assert pages[2] == ""  # page 3 of the PDF file is empty.

    # Whitespace can differ between converters (\n, " ", ...), so collapse it
    # to single spaces before checking that the text was retained.
    page_standard_whitespace = " ".join(pages[0].split())
    assert "Adobe Systems made the PDF specification available free of charge in 1993." in page_standard_whitespace
2020-06-08 11:07:19 +02:00
2022-06-07 09:23:03 +02:00
# Marked as integration because it uses poppler, which is not installed in the unit tests suite
@pytest.mark.integration
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Poppler not installed on Windows CI")
def test_pdftoppm_command_format():
    """Run pdftoppm directly with the command format pdf2image uses in Haystack.

    Some pdftoppm installations do not raise an error for an incompatible
    command format but silently return empty converted documents; usage info
    on stderr is the tell-tale sign.
    """
    process = subprocess.Popen(
        ["pdftoppm", f"{SAMPLES_PATH}/pdf/sample_pdf_1.pdf"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    _, err = process.communicate()
    assert (
        not err
    ), 'Your installation of poppler is incompatible with Haystack. Try installing via "conda install -c conda-forge poppler"'
2023-03-01 18:34:38 -03:00
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_command_whitespaces(Converter):
    """A PDF whose file name contains spaces must still convert correctly."""
    converter = Converter()
    result, _ = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample pdf file with spaces on file name.pdf")
    document = result["documents"][0]
    assert "ɪ" in document.content
2023-03-01 18:34:38 -03:00
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_encoding(Converter):
    """Non-ASCII characters (Cyrillic, IPA) must survive the conversion."""
    converter = Converter()

    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_5.pdf")[0]["documents"][0]
    assert "Ж" in document.content

    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
    assert "ɪ" in document.content
2023-03-01 18:34:38 -03:00
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_sort_by_position(Converter):
    """With sort_by_position=True the text order follows page layout."""
    converter = Converter(sort_by_position=True)

    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_3.pdf")[0]
    assert str(document.content).startswith("This is the second test sentence.")
2023-03-01 18:34:38 -03:00
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_ligatures(Converter):
    """known_ligatures controls which ligature characters get expanded."""
    converter = Converter()

    # Default: all known ligatures are expanded.
    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
    assert "ﬀ" not in document.content
    assert "ɪ" in document.content

    # Empty mapping: nothing is expanded.
    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf", known_ligatures={})[0]["documents"][0]
    assert "ﬀ" in document.content
    assert "ɪ" in document.content

    # Custom mapping: only the listed ligature is expanded.
    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf", known_ligatures={"ɪ": "i"})[0][
        "documents"
    ][0]
    assert "ﬀ" in document.content
    assert "ɪ" not in document.content
2023-03-01 18:34:38 -03:00
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_page_range(Converter):
    """start_page skips leading pages but keeps the total page count."""
    converter = Converter()
    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf", start_page=2)[0]
    pages = document.content.split("\f")

    # The sample PDF has four pages; the first is skipped but must still be
    # counted so page numbering stays correct downstream.
    assert len(pages) == 4
    assert pages[0] == ""  # page 1 was skipped.
    assert pages[1] != ""  # page 2 is not empty.
    assert pages[2] == ""  # page 3 is empty.
2023-03-01 18:34:38 -03:00
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_page_range_numbers(Converter):
    """PreProcessor page numbers must account for pages skipped via start_page."""
    converter = Converter()
    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf", start_page=2)[0]

    preprocessor = PreProcessor(
        split_by="word", split_length=5, split_overlap=0, split_respect_sentence_boundary=False, add_page_number=True
    )
    documents = preprocessor.process([document])

    assert documents[1].meta["page"] == 4
2023-03-01 18:34:38 -03:00
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel(Converter):
    """Multiprocessing conversion keeps pages in their original order."""
    converter = Converter(multiprocessing=True)
    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_6.pdf")[0]

    pages = document.content.split("\f")
    assert pages[0] == "This is the page 1 of the document."
    assert pages[-1] == "This is the page 50 of the document."
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel_page_range(Converter):
    """Multiprocessing conversion honors start_page and keeps the page count."""
    converter = Converter(multiprocessing=True)
    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_6.pdf", start_page=2)[0]

    pages = document.content.split("\f")
    assert pages[0] == ""  # the skipped first page stays as an empty placeholder.
    assert len(pages) == 50
@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel_sort_by_position(Converter):
    """Multiprocessing combined with sort_by_position preserves page order."""
    converter = Converter(multiprocessing=True, sort_by_position=True)
    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_6.pdf")[0]

    pages = document.content.split("\f")
    assert pages[0] == "This is the page 1 of the document."
    assert pages[-1] == "This is the page 50 of the document."
@fail_at_version(1, 17)
def test_deprecated_encoding():
    """Passing `encoding` to the constructor must raise a DeprecationWarning."""
    with pytest.warns(DeprecationWarning):
        PDFToTextConverter(encoding="utf-8")
@fail_at_version(1, 17)
def test_deprecated_encoding_in_convert_method():
    """Passing `encoding` to convert() must raise a DeprecationWarning."""
    converter = PDFToTextConverter()
    with pytest.warns(DeprecationWarning):
        converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf", encoding="utf-8")
@fail_at_version(1, 17)
def test_deprecated_keep_physical_layout():
    """Passing `keep_physical_layout` to the constructor must raise a DeprecationWarning."""
    with pytest.warns(DeprecationWarning):
        PDFToTextConverter(keep_physical_layout=True)
@fail_at_version(1, 17)
def test_deprecated_keep_physical_layout_in_convert_method():
    """Passing `keep_physical_layout` to convert() must raise a DeprecationWarning."""
    converter = PDFToTextConverter()
    with pytest.warns(DeprecationWarning):
        converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf", keep_physical_layout=True)
2020-10-30 18:06:02 +01:00
@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_table_removal(Converter):
    """remove_numeric_tables must strip numeric table rows from the output."""
    converter = Converter(remove_numeric_tables=True)
    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")[0]
    pages = document.content.split("\f")

    # Numeric rows of the table must be gone.
    assert "324" not in pages[0]
    assert "54x growth" not in pages[0]
2020-10-30 18:06:02 +01:00
@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_language_validation(Converter, caplog):
    """valid_languages only logs a warning when the detected language mismatches."""
    # English sample vs. expected English: no warning.
    converter = Converter(valid_languages=["en"])
    converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
    assert "sample_pdf_1.pdf is not one of ['en']." not in caplog.text

    # English sample vs. expected German: warning is logged.
    converter = Converter(valid_languages=["de"])
    converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
    assert "sample_pdf_1.pdf is not one of ['de']." in caplog.text
2020-06-08 11:07:19 +02:00
2023-02-27 15:00:19 +01:00
@pytest.mark.unit
def test_docx_converter():
    """DocxToTextConverter extracts the text of a .docx sample."""
    converter = DocxToTextConverter()
    document = converter.convert(file_path=SAMPLES_PATH / "docx" / "sample_docx.docx")[0]
    assert document.content.startswith("Sample Docx File")
2021-03-23 16:31:26 +01:00
2023-02-27 15:00:19 +01:00
@pytest.mark.unit
def test_markdown_converter():
    """By default the markdown converter strips code snippets from the output."""
    converter = MarkdownConverter()
    document = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")[0]
    assert document.content.startswith("\nWhat to build with Haystack")
    assert "# git clone https://github.com/deepset-ai/haystack.git" not in document.content
2021-11-29 18:44:20 +01:00
2023-02-27 15:00:19 +01:00
@pytest.mark.unit
def test_markdown_converter_headline_extraction():
    """extract_headlines=True records headline text, level and start index in meta."""
    expected_headlines = [
        ("What to build with Haystack", 1),
        ("Core Features", 1),
        ("Quick Demo", 1),
        ("2nd level headline for testing purposes", 2),
        ("3rd level headline for testing purposes", 3),
    ]

    converter = MarkdownConverter(extract_headlines=True, remove_code_snippets=False)
    document = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")[0]

    # All five headlines must be extracted.
    assert len(document.meta["headlines"]) == 5

    for extracted, (headline, level) in zip(document.meta["headlines"], expected_headlines):
        # Headline text and level match the expectation.
        assert extracted["headline"] == headline
        assert extracted["level"] == level

        # start_idx points at the headline's position inside the content.
        start = extracted["start_idx"]
        assert extracted["headline"] == document.content[start : start + len(extracted["headline"])]
2023-02-27 15:00:19 +01:00
@pytest.mark.unit
def test_markdown_converter_frontmatter_to_meta():
    """add_frontmatter_to_meta=True copies the YAML frontmatter into document meta."""
    converter = MarkdownConverter(add_frontmatter_to_meta=True)
    document = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")[0]
    assert document.meta["type"] == "intro"
    assert document.meta["date"] == "1.1.2023"
2023-02-27 15:00:19 +01:00
@pytest.mark.unit
def test_markdown_converter_remove_code_snippets():
    """remove_code_snippets=False keeps code blocks in the converted text."""
    converter = MarkdownConverter(remove_code_snippets=False)
    document = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")[0]
    assert document.content.startswith("pip install farm-haystack")
2021-11-29 18:44:20 +01:00
def test_azure_converter():
    """AzureConverter extracts one table and one text document from the sample PDF.

    Requires live Azure Form Recognizer credentials. Previously the whole body
    was wrapped in an `if` so the test silently *passed* when the credentials
    were missing; skip explicitly instead so the missing setup is visible.
    """
    if "AZURE_FORMRECOGNIZER_ENDPOINT" not in os.environ or "AZURE_FORMRECOGNIZER_KEY" not in os.environ:
        pytest.skip("AZURE_FORMRECOGNIZER_ENDPOINT and/or AZURE_FORMRECOGNIZER_KEY not set in the environment")

    converter = AzureConverter(
        endpoint=os.environ["AZURE_FORMRECOGNIZER_ENDPOINT"],
        credential_key=os.environ["AZURE_FORMRECOGNIZER_KEY"],
        save_json=True,
    )

    docs = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
    assert len(docs) == 2

    assert docs[0].content_type == "table"
    assert docs[0].content.shape[0] == 4  # number of rows
    assert docs[0].content.shape[1] == 5  # number of columns, Form Recognizer assumes there are 5 columns
    assert list(docs[0].content.columns) == ["", "Column 1", "", "Column 2", "Column 3"]
    assert list(docs[0].content.iloc[3]) == ["D", "$54.35", "", "$6345.", ""]
    assert (
        docs[0].meta["preceding_context"] == "specification. These proprietary technologies are not "
        "standardized and their\nspecification is published only on "
        "Adobe's website. Many of them are also not\nsupported by "
        "popular third-party implementations of PDF."
    )
    assert docs[0].meta["following_context"] == ""
    assert docs[0].meta["page"] == 1

    assert docs[1].content_type == "text"
    assert docs[1].content.startswith("A sample PDF file")
2021-12-30 10:15:11 +01:00
2022-06-07 09:23:03 +02:00
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Parsr not running on Windows CI")
def test_parsr_converter():
    """ParsrConverter extracts one table and one text document from the sample PDF."""
    converter = ParsrConverter()
    docs = converter.convert(file_path=str((SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf").absolute()))

    assert len(docs) == 2

    table_doc, text_doc = docs
    assert table_doc.content_type == "table"
    assert table_doc.content.shape[0] == 4  # number of rows
    assert table_doc.content.shape[1] == 4
    assert list(table_doc.content.columns) == ["", "Column 1", "Column 2", "Column 3"]
    assert list(table_doc.content.iloc[3]) == ["D", "$54.35", "$6345.", ""]
    assert (
        table_doc.meta["preceding_context"] == "specification. These proprietary technologies are not "
        "standardized and their\nspecification is published only on "
        "Adobe's website. Many of them are also not\nsupported by popular "
        "third-party implementations of PDF."
    )
    assert table_doc.meta["following_context"] == ""
    assert table_doc.meta["page"] == 1

    assert text_doc.content_type == "text"
    assert text_doc.content.startswith("A sample PDF file")
    assert text_doc.content.endswith("Page 4 of Sample PDF\n… the page 3 is empty.")
2022-06-24 09:55:09 +02:00
2022-10-31 19:00:02 +01:00
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Parsr not running on Windows CI")
def test_parsr_converter_headline_extraction():
    """ParsrConverter records headline text, level and (for text docs) start index."""
    expected_headlines = [
        [("Lorem ipsum", 1), ("Cras fringilla ipsum magna, in fringilla dui commodo\na.", 2)],
        [
            ("Lorem ipsum", 1),
            ("Lorem ipsum dolor sit amet, consectetur adipiscing\nelit. Nunc ac faucibus odio.", 2),
            ("Cras fringilla ipsum magna, in fringilla dui commodo\na.", 2),
            ("Lorem ipsum dolor sit amet, consectetur adipiscing\nelit.", 2),
            ("Maecenas mauris lectus, lobortis et purus mattis, blandit\ndictum tellus.", 2),
            ("In eleifend velit vitae libero sollicitudin euismod.", 2),
        ],
    ]

    converter = ParsrConverter()
    docs = converter.convert(file_path=str((SAMPLES_PATH / "pdf" / "sample_pdf_4.pdf").absolute()))
    assert len(docs) == 2

    for doc, expectation in zip(docs, expected_headlines):
        for extracted, (headline, level) in zip(doc.meta["headlines"], expectation):
            # Headline text and level match the expectation.
            assert extracted["headline"] == headline
            assert extracted["level"] == level

            # start_idx is only meaningful for text documents.
            if doc.content_type == "text":
                start = extracted["start_idx"]
                assert extracted["headline"] == doc.content[start : start + len(extracted["headline"])]
2023-02-27 15:00:19 +01:00
@pytest.mark.unit
def test_id_hash_keys_from_pipeline_params():
    """Including `meta` in id_hash_keys yields distinct ids for identical content."""
    doc_path = SAMPLES_PATH / "docs" / "doc_1.txt"
    meta = [{"key": "a"}, {"key": "b"}]

    converter = TextConverter()
    output, _ = converter.run(file_paths=[doc_path, doc_path], meta=meta, id_hash_keys=["content", "meta"])
    documents = output["documents"]

    # Same file twice, but different meta => two documents with two unique ids.
    assert len(documents) == 2
    assert len({d.id for d in documents}) == 2
2023-01-23 15:56:36 +01:00
2023-02-27 15:00:19 +01:00
def write_as_csv(data: List[List[str]], file_path: Path) -> None:
    """Write `data` rows to `file_path` in CSV format.

    Test helper, not a test itself, so it carries no pytest mark (the previous
    `@pytest.mark.unit` on it was a no-op mistake). The file is opened with
    newline="" as required by the csv module, so rows are not double-spaced
    on platforms where "\\n" is translated (e.g. Windows).
    """
    with open(file_path, "w", newline="") as f:
        csv.writer(f).writerows(data)
2023-02-27 15:00:19 +01:00
@pytest.mark.unit
def test_csv_to_document_with_qa_headers(tmp_path):
    """A CSV with 'question'/'answer' headers becomes one Document per row."""
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_headers.csv"
    write_as_csv(
        [
            ["question", "answer"],
            ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
        ],
        csv_path,
    )

    output, edge = node.run(file_paths=csv_path)

    assert edge == "output_1"
    assert "documents" in output
    assert len(output["documents"]) == 1

    doc = output["documents"][0]
    assert isinstance(doc, Document)
    assert doc.content == "What is Haystack ?"
    assert doc.meta["answer"] == "Haystack is an NLP Framework to use transformers in your Applications."
2023-02-27 15:00:19 +01:00
@pytest.mark.unit
def test_csv_to_document_with_wrong_qa_headers(tmp_path):
    """Both headers wrong: the converter must raise a ValueError."""
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    write_as_csv(
        [
            ["wrong", "headers"],
            ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
        ],
        csv_path,
    )

    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)
2023-02-27 15:00:19 +01:00
@pytest.mark.unit
def test_csv_to_document_with_one_wrong_qa_headers(tmp_path):
    """First header wrong: the converter must raise a ValueError."""
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    write_as_csv(
        [
            ["wrong", "answers"],
            ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
        ],
        csv_path,
    )

    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)
2023-02-27 15:00:19 +01:00
@pytest.mark.unit
def test_csv_to_document_with_another_wrong_qa_headers(tmp_path):
    """Second header wrong: the converter must raise a ValueError."""
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    write_as_csv(
        [
            ["question", "wrong"],
            ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
        ],
        csv_path,
    )

    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)
2023-02-27 15:00:19 +01:00
@pytest.mark.unit
def test_csv_to_document_with_one_column(tmp_path):
    """Only one column present: the converter must raise a ValueError."""
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    write_as_csv([["question"], ["What is Haystack ?"]], csv_path)

    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)
2023-02-27 15:00:19 +01:00
@pytest.mark.unit
def test_csv_to_document_with_three_columns(tmp_path):
    """An extra third column: the converter must raise a ValueError."""
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    write_as_csv(
        [
            ["question", "answer", "notes"],
            ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications.", "verified"],
        ],
        csv_path,
    )

    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)
2023-02-27 15:00:19 +01:00
@pytest.mark.unit
def test_csv_to_document_many_files(tmp_path):
    """Multiple CSV files convert to one Document each, in input order."""
    node = CsvTextConverter()
    csv_paths = []
    for i in range(5):
        csv_path = tmp_path / f"{i}_csv_qa_with_headers.csv"
        csv_paths.append(csv_path)
        write_as_csv(
            [
                ["question", "answer"],
                [
                    f"{i}. What is Haystack ?",
                    f"{i}. Haystack is an NLP Framework to use transformers in your Applications.",
                ],
            ],
            csv_path,
        )

    output, edge = node.run(file_paths=csv_paths)

    assert edge == "output_1"
    assert "documents" in output
    assert len(output["documents"]) == 5

    for i, doc in enumerate(output["documents"]):
        assert isinstance(doc, Document)
        assert doc.content == f"{i}. What is Haystack ?"
        assert doc.meta["answer"] == f"{i}. Haystack is an NLP Framework to use transformers in your Applications."
2023-02-21 09:23:42 +01:00
2023-02-27 15:00:19 +01:00
@pytest.mark.unit
class TestJsonConverter:
    """JsonConverter tests against .json/.jsonl fixtures written per test run."""

    JSON_FILE_NAME = "json_normal.json"
    JSONL_FILE_NAME = "json_normal.jsonl"
    JSON_SINGLE_LINE_FILE_NAME = "json_all_single.json"
    JSONL_LIST_LINE_FILE_NAME = "json_list_line.jsonl"
    JSON_INVALID = "json_invalid.json"

    @classmethod
    @pytest.fixture(autouse=True)
    def setup_class(cls, tmp_path):
        # Write the fixture files before every test.
        # Note: we are tying the behavior of `JsonConverter` to that of the
        # `to_dict()` method on the `Document`.
        documents = [
            Document(
                content=pd.DataFrame(
                    [["C", "Yes", "No"], ["Haskell", "No", "No"], ["Python", "Yes", "Yes"]],
                    columns=["Language", "Imperative", "OO"],
                ),
                content_type="table",
                meta={"context": "Programming Languages", "page": 2},
            ),
            Document(
                content="Programming languages are used for controlling the behavior of a machine (often a computer).",
                content_type="text",
                meta={"context": "Programming Languages", "page": 1},
            ),
            Document(
                content=pd.DataFrame(
                    [["C", 1, 1], ["Python", 6, 6.5]], columns=["Language", "Statements ratio", "Line ratio"]
                ),
                content_type="table",
                meta={"context": "Expressiveness", "page": 3},
            ),
        ]
        doc_dicts = [d.to_dict() for d in documents]

        # Plain .json: a list of document dicts.
        (tmp_path / cls.JSON_FILE_NAME).write_text(json.dumps(doc_dicts))

        # .jsonl: one document dict per line.
        (tmp_path / cls.JSONL_FILE_NAME).write_text("".join(json.dumps(d) + "\n" for d in doc_dicts))

        # .json but everything written in a single line.
        (tmp_path / cls.JSON_SINGLE_LINE_FILE_NAME).write_text(json.dumps(doc_dicts))

        # Two lines (jsonl) but each line contains a list of dicts instead of a dict.
        (tmp_path / cls.JSONL_LIST_LINE_FILE_NAME).write_text(
            "".join(json.dumps(chunk) + "\n" for chunk in (doc_dicts[:2], doc_dicts[2:3]))
        )

        # A file that is not valid JSON at all.
        (tmp_path / cls.JSON_INVALID).write_text("{ an invalid json string}")

    def _assert_docs_okay(self, docs):
        # Expect two table docs around one text doc: [table, text, table].
        assert len(docs) == 3
        assert all(doc.meta["topic"] == "programming" for doc in docs)
        # "context" in metadata should have been overwritten to "PL"
        # instead of "Programming Languages".
        assert all(doc.meta["context"] == "PL" for doc in docs)
        assert all(d.content_type == expected for d, expected in zip(docs, ("table", "text", "table")))

        # Text doc test.
        assert (
            docs[1].content
            == "Programming languages are used for controlling the behavior of a machine (often a computer)."
        )

        # Table doc tests.
        assert isinstance(docs[0].content, pd.DataFrame)
        assert docs[0].content.shape == (3, 3)
        assert isinstance(docs[2].content, pd.DataFrame)
        assert docs[2].content.shape == (2, 3)

    def test_json_to_documents(self, tmp_path):
        converter = JsonConverter()
        docs = converter.convert(tmp_path / self.JSON_FILE_NAME, meta={"topic": "programming", "context": "PL"})
        self._assert_docs_okay(docs)

    def test_json_to_documents_single_line(self, tmp_path):
        converter = JsonConverter()
        docs = converter.convert(
            tmp_path / self.JSON_SINGLE_LINE_FILE_NAME, meta={"topic": "programming", "context": "PL"}
        )
        self._assert_docs_okay(docs)

    def test_jsonl_to_documents(self, tmp_path):
        converter = JsonConverter()
        docs = converter.convert(tmp_path / self.JSONL_FILE_NAME, meta={"topic": "programming", "context": "PL"})
        self._assert_docs_okay(docs)

    def test_jsonl_to_documents_list_line(self, tmp_path):
        converter = JsonConverter()
        docs = converter.convert(
            tmp_path / self.JSONL_LIST_LINE_FILE_NAME, meta={"topic": "programming", "context": "PL"}
        )
        self._assert_docs_okay(docs)

    def test_json_invalid(self, tmp_path):
        converter = JsonConverter()
        with pytest.raises(json.JSONDecodeError) as excinfo:
            converter.convert(tmp_path / self.JSON_INVALID)
        # The offending file name must appear in the error message.
        assert self.JSON_INVALID in str(excinfo.value)