2021-11-29 18:44:20 +01:00
import os
2022-06-07 09:23:03 +02:00
import sys
from pathlib import Path
2022-05-24 11:31:32 +02:00
import subprocess
2020-06-08 11:07:19 +02:00
2020-08-17 11:21:09 +02:00
import pytest
2020-08-17 11:13:52 +02:00
2022-02-03 13:43:18 +01:00
from haystack . nodes import (
MarkdownConverter ,
DocxToTextConverter ,
PDFToTextConverter ,
PDFToTextOCRConverter ,
TikaConverter ,
AzureConverter ,
ParsrConverter ,
2022-06-24 09:55:09 +02:00
TextConverter ,
2022-02-03 13:43:18 +01:00
)
2020-06-08 11:07:19 +02:00
2022-05-17 10:55:53 +02:00
from . . conftest import SAMPLES_PATH
2022-01-26 18:12:55 +01:00
2020-10-30 18:06:02 +01:00
@pytest.mark.tika
@pytest.mark.parametrize(" Converter ", [PDFToTextConverter, TikaConverter, PDFToTextOCRConverter])
def test_convert(Converter):
    """Convert the sample PDF with each converter and check page layout and retained text.

    The converter is run through ``run()``; the content of the first returned
    document is split into pages, and the test then checks the page count,
    that page 1 has text, that page 3 is empty, and that a known sentence
    survives conversion.
    """
    converter = Converter()
    document = converter.run(file_paths=SAMPLES_PATH / " pdf " / " sample_pdf_1.pdf ")[0][" documents "][0]
    # NOTE(review): the string literals in this test (page separator, path parts,
    # dict key) carry leading/trailing spaces — confirm against the checked-in file.
    pages = document.content.split(" \f ")

    # Only blame poppler when the result really is "a single empty page", which is
    # what the message describes; every other mismatch is reported by the more
    # specific assertions below.
    assert not (
        len(pages) == 1 and pages[0] == " "
    ), f' { type ( converter ) . __name__ } did return a single empty page indicating a potential issue with your installed poppler version. Try installing via " conda install -c conda-forge poppler " and check test_pdftoppm_command_format() '

    assert len(pages) == 4  # the sample PDF file has four pages.
    assert pages[0] != " "  # the page 1 of PDF contains text.
    assert pages[2] == " "  # the page 3 of PDF file is empty.

    # assert text is retained from the document.
    # As whitespace can differ (\n," ", etc.), we standardize all to simple whitespace
    page_standard_whitespace = " ".join(pages[0].split())
    assert " Adobe Systems made the PDF specification available free of charge in 1993. " in page_standard_whitespace
2020-06-08 11:07:19 +02:00
2022-06-07 09:23:03 +02:00
# Marked as integration because it uses poppler, which is not installed in the unit tests suite
@pytest.mark.integration
@pytest.mark.skipif(sys.platform in [" win32 ", " cygwin "], reason=" Poppler not installed on Windows CI ")
def test_pdftoppm_command_format():
    """Check that the installed pdftoppm accepts the command format used via pdf2image.

    Haystack's PDFToTextOCRConverter uses pdf2image, which calls pdftoppm internally.
    Some installations of pdftoppm are incompatible with Haystack and won't raise an
    error but just return empty converted documents. This test runs pdftoppm directly
    and fails if anything is written to stderr.
    """
    # subprocess.run waits for the process and collects both streams in one call.
    completed = subprocess.run(
        [" pdftoppm ", f" { SAMPLES_PATH } /pdf/sample_pdf_1.pdf "],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # If usage info of pdftoppm is sent to stderr then it's because Haystack's pdf2image uses an incompatible command format
    assert (
        not completed.stderr
    ), ' Your installation of poppler is incompatible with Haystack. Try installing via " conda install -c conda-forge poppler " '
@pytest.mark.parametrize(" Converter ", [PDFToTextConverter])
def test_pdf_encoding(Converter):
    """Check that the ``encoding`` run parameter changes which characters appear in the output.

    The second sample PDF is converted twice: once with default settings, where
    the probe character must be present, and once with ``encoding`` set
    explicitly, where it must be absent.
    """
    pdf_converter = Converter()
    sample_file = SAMPLES_PATH / " pdf " / " sample_pdf_2.pdf "

    # Default run: the probe character is expected in the content.
    default_result = pdf_converter.run(file_paths=sample_file)
    default_doc = default_result[0][" documents "][0]
    assert " ɪ " in default_doc.content

    # Run with an explicit encoding: the probe character must be gone.
    latin1_result = pdf_converter.run(file_paths=sample_file, encoding=" Latin1 ")
    latin1_doc = latin1_result[0][" documents "][0]
    assert " ɪ " not in latin1_doc.content
2022-05-24 11:31:32 +02:00
@pytest.mark.parametrize(" Converter ", [PDFToTextConverter])
def test_pdf_ligatures(Converter):
    """Check how the ``known_ligatures`` run parameter affects the converted content.

    Three runs over the second sample PDF: default settings, an empty
    ``known_ligatures`` mapping, and a mapping with a single entry.
    """
    pdf_converter = Converter()
    sample_file = SAMPLES_PATH / " pdf " / " sample_pdf_2.pdf "

    # NOTE(review): " ff " below is plain ASCII in this view; presumably a ligature
    # glyph was intended — confirm against the checked-in file.

    # 1) Default settings.
    default_doc = pdf_converter.run(file_paths=sample_file)[0][" documents "][0]
    assert " ff " not in default_doc.content
    assert " ɪ " in default_doc.content

    # 2) Empty mapping.
    no_mapping_result = pdf_converter.run(file_paths=sample_file, known_ligatures={})
    no_mapping_doc = no_mapping_result[0][" documents "][0]
    assert " ff " in no_mapping_doc.content
    assert " ɪ " in no_mapping_doc.content

    # 3) Custom single-entry mapping.
    custom_result = pdf_converter.run(file_paths=sample_file, known_ligatures={" ɪ ": " i "})
    custom_doc = custom_result[0][" documents "][0]
    assert " ff " in custom_doc.content
    assert " ɪ " not in custom_doc.content
2020-10-30 18:06:02 +01:00
@pytest.mark.tika
@pytest.mark.parametrize(" Converter ", [PDFToTextConverter, TikaConverter])
def test_table_removal(Converter):
    """With ``remove_numeric_tables=True``, numeric table rows must not appear on page 1."""
    table_stripping_converter = Converter(remove_numeric_tables=True)
    converted = table_stripping_converter.convert(file_path=SAMPLES_PATH / " pdf " / " sample_pdf_1.pdf ")
    first_page = converted[0].content.split(" \f ")[0]

    # assert numeric rows are removed from the table.
    assert " 324 " not in first_page
    assert " 54x growth " not in first_page
2020-10-30 18:06:02 +01:00
@pytest.mark.tika
@pytest.mark.parametrize(" Converter ", [PDFToTextConverter, TikaConverter])
def test_language_validation(Converter, caplog):
    """Check the ``valid_languages`` log message for a matching and a non-matching language.

    The sample PDF is converted once per language list; the captured log text
    must not contain the message for the first list and must contain it for
    the second.
    """
    sample_file = SAMPLES_PATH / " pdf " / " sample_pdf_1.pdf "

    # First language list: the message must not be logged.
    english_converter = Converter(valid_languages=[" en "])
    english_converter.convert(file_path=sample_file)
    assert " sample_pdf_1.pdf is not one of [ ' en ' ]. " not in caplog.text

    # Second language list: the message must be logged.
    german_converter = Converter(valid_languages=[" de "])
    german_converter.convert(file_path=sample_file)
    assert " sample_pdf_1.pdf is not one of [ ' de ' ]. " in caplog.text
2020-06-08 11:07:19 +02:00
2021-02-12 13:38:54 +01:00
def test_docx_converter():
    """Convert the sample DOCX file and check how the first document's content starts."""
    sample_file = SAMPLES_PATH / " docx " / " sample_docx.docx "
    first_document = DocxToTextConverter().convert(file_path=sample_file)[0]
    assert first_document.content.startswith(" Sample Docx File ")
2021-03-23 16:31:26 +01:00
def test_markdown_converter():
    """Convert the sample Markdown file and check how the first document's content starts."""
    sample_file = SAMPLES_PATH / " markdown " / " sample.md "
    first_document = MarkdownConverter().convert(file_path=sample_file)[0]
    assert first_document.content.startswith(" What to build with Haystack ")
2021-11-29 18:44:20 +01:00
def test_azure_converter():
    """Run AzureConverter on the sample PDF and check the extracted table and text documents.

    The endpoint and credential key are read from environment variables. When
    either one is missing the test is reported as skipped instead of passing
    without having checked anything.
    """
    # Check if Form Recognizer endpoint and credential key in environment variables
    if " AZURE_FORMRECOGNIZER_ENDPOINT " not in os.environ or " AZURE_FORMRECOGNIZER_KEY " not in os.environ:
        pytest.skip("Azure Form Recognizer endpoint and/or credential key not found in environment variables")

    converter = AzureConverter(
        endpoint=os.environ[" AZURE_FORMRECOGNIZER_ENDPOINT "],
        credential_key=os.environ[" AZURE_FORMRECOGNIZER_KEY "],
        save_json=True,
    )

    docs = converter.convert(file_path=SAMPLES_PATH / " pdf " / " sample_pdf_1.pdf ")
    assert len(docs) == 2

    # First document: the table, with its surrounding context stored in meta.
    assert docs[0].content_type == " table "
    assert docs[0].content.shape[0] == 4  # number of rows
    assert docs[0].content.shape[1] == 5  # number of columns, Form Recognizer assumes there are 5 columns
    assert list(docs[0].content.columns) == [" ", " Column 1 ", " ", " Column 2 ", " Column 3 "]
    assert list(docs[0].content.iloc[3]) == [" D ", " $54.35 ", " ", " $6345. ", " "]
    assert (
        docs[0].meta[" preceding_context "] == " specification. These proprietary technologies are not "
        " standardized and their \n specification is published only on "
        " Adobe ' s website. Many of them are also not \n supported by "
        " popular third-party implementations of PDF. "
    )
    assert docs[0].meta[" following_context "] == " "

    # Second document: the plain text.
    assert docs[1].content_type == " text "
    assert docs[1].content.startswith(" A sample PDF file ")
2021-12-30 10:15:11 +01:00
2022-06-07 09:23:03 +02:00
@pytest.mark.skipif(sys.platform in [" win32 ", " cygwin "], reason=" Parsr not running on Windows CI ")
def test_parsr_converter():
    """Run ParsrConverter on the sample PDF and check the extracted table and text documents.

    The converter is given the absolute path as a string. Two documents are
    expected: a table (with its surrounding context in ``meta``) and a text
    document.
    """
    sample_file = (SAMPLES_PATH / " pdf " / " sample_pdf_1.pdf ").absolute()
    docs = ParsrConverter().convert(file_path=str(sample_file))
    assert len(docs) == 2

    table_doc = docs[0]
    text_doc = docs[1]

    # Table document: shape, header row and last data row.
    assert table_doc.content_type == " table "
    assert table_doc.content.shape[0] == 4  # number of rows
    assert table_doc.content.shape[1] == 4
    assert list(table_doc.content.columns) == [" ", " Column 1 ", " Column 2 ", " Column 3 "]
    assert list(table_doc.content.iloc[3]) == [" D ", " $54.35 ", " $6345. ", " "]

    # Context stored alongside the table.
    expected_preceding_context = (
        " specification. These proprietary technologies are not "
        " standardized and their \n specification is published only on "
        " Adobe ' s website. Many of them are also not \n supported by popular "
        " third-party implementations of PDF. "
    )
    assert table_doc.meta[" preceding_context "] == expected_preceding_context
    assert table_doc.meta[" following_context "] == " "

    # Text document: checked by its beginning and its end.
    assert text_doc.content_type == " text "
    assert text_doc.content.startswith(" A sample PDF file ")
    assert text_doc.content.endswith(" Page 4 of Sample PDF \n … the page 3 is empty. ")
2022-06-24 09:55:09 +02:00
def test_id_hash_keys_from_pipeline_params():
    """Check that ``id_hash_keys`` passed to ``run()`` gives distinct ids to same-file documents.

    The same text file is converted twice in one ``run()`` call with different
    ``meta`` dicts; the two resulting documents must have different ids.
    """
    source_file = SAMPLES_PATH / " docs " / " doc_1.txt "
    per_file_meta = [{" key ": " a "}, {" key ": " b "}]

    text_converter = TextConverter()
    result, _ = text_converter.run(
        file_paths=[source_file, source_file],
        meta=per_file_meta,
        id_hash_keys=[" content ", " meta "],
    )
    documents = result[" documents "]
    distinct_ids = {doc.id for doc in documents}

    assert len(documents) == 2
    assert len(distinct_ids) == 2