haystack/test/test_pdf_conversion.py

52 lines
2.2 KiB
Python
Raw Normal View History

import logging
2020-06-08 11:07:19 +02:00
from pathlib import Path
2020-08-14 14:13:59 +02:00
from haystack.indexing.file_converters.pdf import PDFToTextConverter
logger = logging.getLogger(__name__)
2020-06-08 11:07:19 +02:00
def test_extract_pages(xpdf_fixture):
    """Check page count and per-page emptiness for the first sample PDF.

    ``xpdf_fixture`` is a pytest fixture defined outside this file; it is
    requested only for its setup and is not used directly.
    """
    sample_pdf = Path("samples/pdf/sample_pdf_1.pdf")
    extracted = PDFToTextConverter().extract_pages(file_path=sample_pdf)

    # The sample PDF file has four pages.
    assert len(extracted) == 4
    # Page 1 of the PDF contains text.
    assert extracted[0] != ""
    # Page 3 of the PDF is empty.
    assert extracted[2] == ""
def test_table_removal(xpdf_fixture):
    """Check that ``remove_numeric_tables=True`` drops numeric table rows.

    Only the first page of the sample PDF is inspected: the numeric cells
    must be gone while a known sentence of running text must remain.
    ``xpdf_fixture`` is a pytest fixture defined outside this file.
    """
    table_stripper = PDFToTextConverter(remove_numeric_tables=True)
    sample_pdf = Path("samples/pdf/sample_pdf_1.pdf")
    first_page = table_stripper.extract_pages(file_path=sample_pdf)[0]

    # Numeric rows are removed from the table.
    for numeric_cell in ("324", "54x growth", "$54.35"):
        assert numeric_cell not in first_page

    # Text is retained from the document.
    retained_sentence = "Adobe Systems made the PDF specification available free of charge in 1993."
    assert retained_sentence in first_page
2020-06-08 11:07:19 +02:00
def test_language_validation(xpdf_fixture, caplog):
    """Check the language-mismatch log message for the first sample PDF.

    With ``valid_languages=["en"]`` the message must be absent from the
    captured log; with ``valid_languages=["de"]`` it must be present.
    ``caplog`` is pytest's log-capturing fixture; ``xpdf_fixture`` is a
    pytest fixture defined outside this file.
    """
    sample_pdf = Path("samples/pdf/sample_pdf_1.pdf")

    # Only the logging side effect matters here, so the extracted pages
    # are discarded.
    PDFToTextConverter(valid_languages=["en"]).extract_pages(file_path=sample_pdf)
    assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']." not in caplog.text

    PDFToTextConverter(valid_languages=["de"]).extract_pages(file_path=sample_pdf)
    assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text
def test_header_footer_removal(xpdf_fixture):
    """Check ``remove_header_footer`` against two sample PDFs.

    * ``sample_pdf_1.pdf`` is noted as containing no header/footer, so the
      pages extracted with removal enabled must equal the pages extracted
      with removal disabled.
    * ``sample_pdf_2.pdf`` is noted as containing a header and a footer, so
      neither word may appear in any page once removal is enabled.

    ``xpdf_fixture`` is a pytest fixture defined outside this file; it is
    requested only for its setup and is not used directly.
    """
    converter = PDFToTextConverter(remove_header_footer=True)
    converter_no_removal = PDFToTextConverter(remove_header_footer=False)

    plain_pdf = Path("samples/pdf/sample_pdf_1.pdf")  # file contains no header/footer
    pages1 = converter.extract_pages(file_path=plain_pdf)
    pages2 = converter_no_removal.extract_pages(file_path=plain_pdf)

    # zip() stops at the shorter input, so check the page counts explicitly;
    # otherwise a dropped page would go unnoticed.
    assert len(pages1) == len(pages2)
    for p1, p2 in zip(pages1, pages2):
        # Compare the two extractions with each other. The previous
        # ``p2 == p2`` was always true and therefore verified nothing.
        assert p1 == p2

    pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header and footer
    for page in pages:
        assert "header" not in page
        assert "footer" not in page