haystack/test/test_pdf_conversion.py

56 lines
2.5 KiB
Python
Raw Normal View History

2020-06-08 11:07:19 +02:00
from pathlib import Path
2020-08-17 11:21:09 +02:00
import pytest
2020-08-17 11:21:09 +02:00
from haystack.indexing.file_converters.pdf import PDFToTextConverter
from haystack.indexing.file_converters.tika import TikaConverter
2020-06-08 11:07:19 +02:00
2020-08-17 11:21:09 +02:00
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_extract_pages(Converter, xpdf_fixture):
    """Extract the sample PDF and check the page count and page emptiness."""
    sample_file = Path("samples/pdf/sample_pdf_1.pdf")
    pdf_converter = Converter()
    page_texts, _ = pdf_converter.extract_pages(file_path=sample_file)

    # The sample PDF file has four pages.
    assert len(page_texts) == 4
    # Page 1 of the PDF contains text.
    assert page_texts[0] != ""
    # Page 3 of the PDF is empty.
    assert page_texts[2] == ""
2020-08-17 11:21:09 +02:00
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_table_removal(Converter, xpdf_fixture):
    """With remove_numeric_tables=True, numeric rows vanish but prose stays."""
    sample_file = Path("samples/pdf/sample_pdf_1.pdf")
    page_texts, _ = Converter(remove_numeric_tables=True).extract_pages(
        file_path=sample_file
    )
    first_page = page_texts[0]

    # Numeric rows must have been removed from the table.
    for numeric_fragment in ("324", "54x growth"):
        assert numeric_fragment not in first_page

    # Regular text must be retained; line breaks are dropped before matching.
    flattened_first_page = first_page.replace("\n", "")
    assert (
        "Adobe Systems made the PDF specification available free of charge in 1993."
        in flattened_first_page
    )
2020-06-08 11:07:19 +02:00
2020-08-17 11:21:09 +02:00
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_language_validation(Converter, xpdf_fixture, caplog):
    """Check the language warning is logged only for a non-matching language."""
    sample_file = Path("samples/pdf/sample_pdf_1.pdf")

    # Phase 1: "en" is listed as valid, so no warning is expected in the log.
    english_converter = Converter(valid_languages=["en"])
    _pages, _ = english_converter.extract_pages(file_path=sample_file)
    english_warning = "The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']."
    assert english_warning not in caplog.text

    # Phase 2: only "de" is listed as valid, so the warning must be logged.
    german_converter = Converter(valid_languages=["de"])
    _pages, _ = german_converter.extract_pages(file_path=sample_file)
    german_warning = "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']."
    assert german_warning in caplog.text
2020-08-17 11:21:09 +02:00
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_header_footer_removal(Converter, xpdf_fixture):
    """Check the remove_header_footer option on two sample PDFs.

    For a file without header/footer, extraction with and without removal
    is expected to give the same pages. For a file that has them, the
    header and footer strings must be absent from every extracted page.
    """
    converter = Converter(remove_header_footer=True)
    converter_no_removal = Converter(remove_header_footer=False)

    # file contains no header/footer
    pages1, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
    pages2, _ = converter_no_removal.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))

    # zip() stops at the shorter sequence, so check the lengths explicitly
    # to avoid silently skipping pages in the comparison below.
    assert len(pages1) == len(pages2)
    for p1, p2 in zip(pages1, pages2):
        # Compare the two extractions against each other; the previous
        # `p2 == p2` check was always true and verified nothing.
        assert p1 == p2

    # file contains header and footer
    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_2.pdf"))
    assert len(pages) == 4
    for page in pages:
        assert "This is a header." not in page
        assert "footer" not in page