mirror of https://github.com/deepset-ai/haystack.git
Add PDF text extraction (#109)
This commit is contained in:
parent 479fcb1ace
commit ef9e4f4467
@@ -1,5 +1,5 @@
 language: python
-sudo: false
+sudo: true
 cache: pip
 python:
 - "3.7"
@@ -228,6 +228,11 @@ You will find the Swagger API documentation at http://127.0.0.1:80/docs
 .. image:: https://raw.githubusercontent.com/deepset-ai/haystack/master/docs/img/annotation_tool.png


-7. Development
+7. Indexing PDF files
 ______________________
+
+Haystack has a customizable PDF text extraction pipeline with cleaning functions for headers, footers, and tables. It supports complex document layouts with multi-column text.
+
+8. Development
+-------------------
 * Unit tests can be executed by running :code:`tox`.
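The new README section above describes the converter only at a high level. As a quick, hedged sketch of how it is used (the file path and option values below are made up for illustration; pdftotext, PyMuPDF and langdetect must be installed):

from pathlib import Path

from haystack.indexing.file_converters.pdftotext import PDFToTextConverter

# illustrative path; any local PDF works
converter = PDFToTextConverter(
    remove_numeric_tables=True,
    remove_header_footer=True,
    valid_languages=["en"],
)
pages = converter.extract_pages(file_path=Path("data/my_doc.pdf"))
text = "\n".join(pages)  # one string per page, joined into the full document text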
0  haystack/indexing/file_converters/__init__.py  Normal file
44  haystack/indexing/file_converters/base.py  Normal file
@@ -0,0 +1,44 @@
from abc import abstractmethod
from pathlib import Path


class BaseConverter:
    """
    Base class for implementing file converters that transform input documents to text format for indexing in a database.
    """

    def __init__(
        self,
        remove_numeric_tables: bool = None,
        remove_header_footer: bool = None,
        remove_whitespace: bool = None,
        remove_empty_lines: bool = None,
        valid_languages: [str] = None,
    ):
        """
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for answers.
                                      Rows containing such strings are therefore retained by this option.
        :param remove_header_footer: use a heuristic to remove footers and headers across different pages by searching
                                     for the longest common string. This heuristic uses exact matches and therefore
                                     works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
                                     or similar.
        :param remove_whitespace: strip leading and trailing whitespace from each line in the text.
        :param remove_empty_lines: collapse multiple consecutive empty lines in the text into a single empty line.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add a check for encoding errors. If the extracted text is
                                not in one of the valid languages, it is likely that an encoding error resulted
                                in garbled text.
        """
        self.remove_numeric_tables = remove_numeric_tables
        self.remove_header_footer = remove_header_footer
        self.remove_whitespace = remove_whitespace
        self.remove_empty_lines = remove_empty_lines
        self.valid_languages = valid_languages

    @abstractmethod
    def extract_pages(self, file_path: Path) -> [str]:
        pass
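BaseConverter above only stores the cleaning options and declares the abstract extract_pages() hook. A minimal sketch of how a subclass is meant to look (the class below is hypothetical and not part of this commit):

from pathlib import Path

from haystack.indexing.file_converters.base import BaseConverter


class PlainTextConverter(BaseConverter):  # hypothetical example for illustration only
    def extract_pages(self, file_path: Path) -> [str]:
        # Treat the whole .txt file as a single "page".
        with open(file_path) as f:
            text = f.read()
        if self.remove_whitespace:
            text = "\n".join(line.strip() for line in text.splitlines())
        return [text]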
230  haystack/indexing/file_converters/pdftotext.py  Normal file
@@ -0,0 +1,230 @@
import logging
import re
import subprocess
from functools import partial, reduce
from itertools import chain
from pathlib import Path

import fitz
import langdetect

from haystack.indexing.file_converters.base import BaseConverter

logger = logging.getLogger(__name__)


class PDFToTextConverter(BaseConverter):
    def __init__(
        self,
        remove_numeric_tables: bool = False,
        remove_whitespace: bool = None,
        remove_empty_lines: bool = None,
        remove_header_footer: bool = None,
        valid_languages: [str] = None,
    ):
        """
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for answers.
                                      Rows containing such strings are therefore retained by this option.
        :param remove_whitespace: strip leading and trailing whitespace from each line in the text.
        :param remove_empty_lines: collapse multiple consecutive empty lines in the text into a single empty line.
        :param remove_header_footer: use a heuristic to remove footers and headers across different pages by searching
                                     for the longest common string. This heuristic uses exact matches and therefore
                                     works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
                                     or similar.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add a check for encoding errors. If the extracted text is
                                not in one of the valid languages, it is likely that an encoding error resulted
                                in garbled text.
        """
        verify_installation = subprocess.run(["pdftotext -v"], shell=True)
        if verify_installation.returncode == 127:
            raise Exception(
                """pdftotext is not installed. It is part of the xpdf or poppler-utils software suite.

                   Installation on Linux:
                   wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.02.tar.gz &&
                   tar -xvf xpdf-tools-linux-4.02.tar.gz && sudo cp xpdf-tools-linux-4.02/bin64/pdftotext /usr/local/bin

                   Installation on MacOS:
                   brew install xpdf

                   You can find more details here: https://www.xpdfreader.com
                """
            )

        super().__init__(
            remove_numeric_tables=remove_numeric_tables,
            remove_whitespace=remove_whitespace,
            remove_empty_lines=remove_empty_lines,
            remove_header_footer=remove_header_footer,
            valid_languages=valid_languages,
        )

    def extract_pages(self, file_path: Path) -> [str]:

        page_count = fitz.open(file_path).pageCount

        pages = []
        for page_number in range(1, page_count + 1):
            # The pdftotext tool provides an option to retain the original physical layout of a PDF page. This
            # behaviour can be toggled with the layout param.
            # layout=True
            #   + table structures are retained better
            #   - multi-column pages (e.g. research papers) get extracted with text from multiple columns on the same line
            # layout=False
            #   + keeps strings in content stream order, hence multi-column layouts work well
            #   - cells of tables get split across lines
            #
            # Here, as a "safe" default, layout is turned off.
            page = self._extract_page(file_path, page_number, layout=False)
            lines = page.splitlines()
            cleaned_lines = []
            for line in lines:
                words = line.split()
                digits = [word for word in words if any(i.isdigit() for i in word)]

                # remove lines having > 40% of words as digits AND not ending with a period (.)
                if self.remove_numeric_tables:
                    if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
                        logger.debug(f"Removing line '{line}' from {file_path}")
                        continue

                if self.remove_whitespace:
                    line = line.strip()

                cleaned_lines.append(line)

            page = "\n".join(cleaned_lines)

            if self.remove_empty_lines:
                page = re.sub(r"\n\n+", "\n\n", page)

            pages.append(page)
            page_number += 1

        if self.valid_languages:
            document_text = "".join(pages)
            if not self._validate_language(document_text):
                logger.warning(
                    f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
                    f"been decoded in the correct text format."
                )

        if self.remove_header_footer:
            pages, header, footer = self.find_and_remove_header_footer(
                pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
            )
            logger.info(f"Removed header '{header}' and footer '{footer}' in {file_path}")

        return pages

    def _extract_page(self, file_path: Path, page_number: int, layout: bool):
        """
        Extract a page from the PDF file at file_path.

        :param file_path: path of the PDF file
        :param page_number: page number to extract (starting from 1)
        :param layout: whether to retain the original physical layout for a page. If disabled, PDF pages are read in
                       content stream order.
        """
        if layout:
            command = ["pdftotext", "-layout", "-f", str(page_number), "-l", str(page_number), file_path, "-"]
        else:
            command = ["pdftotext", "-f", str(page_number), "-l", str(page_number), file_path, "-"]
        output_page = subprocess.run(command, capture_output=True, shell=False)
        page = output_page.stdout.decode(errors="ignore")
        return page

    def _validate_language(self, text: str):
        """
        Validate whether the language of the text is one of the valid languages.
        """
        try:
            lang = langdetect.detect(text)
        except langdetect.lang_detect_exception.LangDetectException:
            lang = None

        if lang in self.valid_languages:
            return True
        else:
            return False

    def _ngram(self, seq: str, n: int):
        """
        Return ngrams (of tokens, currently split by whitespace).
        :param seq: str, string from which the ngrams shall be created
        :param n: int, n of ngram
        :return: generator of ngram strings
        """

        # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
        # we add a space here and remove it after creation of the ngrams again (see below)
        seq = seq.replace("\n", " \n")
        seq = seq.replace("\t", " \t")

        seq = seq.split(" ")
        ngrams = (
            " ".join(seq[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(seq) - n + 1)
        )

        return ngrams

    def _allngram(self, seq: str, min_ngram: int, max_ngram: int):
        lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
        ngrams = map(partial(self._ngram, seq), lengths)
        res = set(chain.from_iterable(ngrams))
        return res

    def find_longest_common_ngram(self, sequences: [str], max_ngram: int = 30, min_ngram: int = 3):
        """
        Find the longest common ngram across different text sequences (e.g. start of pages).
        Considers all ngrams within the specified range. Helpful for finding footers, headers, etc.

        :param sequences: list[str], list of strings that shall be searched for common n-grams
        :param max_ngram: int, maximum length of ngram to consider
        :param min_ngram: minimum length of ngram to consider
        :return: str, common string of all sections
        """

        seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
        intersection = reduce(set.intersection, seqs_ngrams)

        try:
            longest = max(intersection, key=len)
        except ValueError:
            # no common sequence found
            longest = ""
        return longest if longest.strip() else None

    def find_and_remove_header_footer(
        self, pages: [str], n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
    ):
        """
        Heuristic to find footers and headers across different pages by searching for the longest common string.
        For headers, we only search in the first n_chars characters (for footers: the last n_chars).
        Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
        but won't detect "Page 3 of 4" or similar.

        :param pages: list of strings, one string per page
        :param n_chars: number of first/last characters in which the header/footer shall be searched
        :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
        :param n_last_pages_to_ignore: number of last pages to ignore
        :return: (cleaned pages, found_header_str, found_footer_str)
        """

        # header
        start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
        found_header = self.find_longest_common_ngram(start_of_pages)
        if found_header:
            pages = [page.replace(found_header, "") for page in pages]

        # footer
        end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
        found_footer = self.find_longest_common_ngram(end_of_pages)
        if found_footer:
            pages = [page.replace(found_footer, "") for page in pages]
        return pages, found_header, found_footer
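To make the header/footer heuristic above concrete, here is a small toy run. The page strings are invented, and instantiating the converter assumes pdftotext, PyMuPDF and langdetect are installed, since the constructor checks for pdftotext:

from haystack.indexing.file_converters.pdftotext import PDFToTextConverter

converter = PDFToTextConverter()  # the constructor verifies that pdftotext is available

# invented pages sharing a recurring title line and a recurring copyright line;
# real pages would come from extract_pages()
pages = [
    "Table of Contents",
    "ACME Corporation Annual Report 2019\nRevenue in the first segment grew.\nCopyright 2019 by ACME",
    "ACME Corporation Annual Report 2019\nThe second segment saw other results.\nCopyright 2019 by ACME",
    "Appendix",
]
cleaned_pages, header, footer = converter.find_and_remove_header_footer(
    pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
)
print(header)  # should be the recurring "ACME Corporation Annual Report 2019" line
print(footer)  # should be the recurring copyright line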
@@ -4,57 +4,53 @@ from farm.data_handler.utils import http_get
 import tempfile
 import tarfile
 import zipfile
+from typing import Callable
+from haystack.indexing.file_converters.pdftotext import PDFToTextConverter

 logger = logging.getLogger(__name__)


-def write_documents_to_db(document_store, document_dir, clean_func=None, only_empty_db=False, split_paragraphs=False):
+def convert_files_to_dicts(dir_path: str, clean_func: Callable = None, split_paragraphs: bool = False) -> [dict]:
     """
-    Write all text files(.txt) in the sub-directories of the given path to the connected database.
+    Convert all files(.txt, .pdf) in the sub-directories of the given path to Python dicts that can be written to a
+    Document Store.

-    :param document_dir: path for the documents to be written to the database
+    :param dir_path: path for the documents to be written to the database
     :param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
-    :param only_empty_db: If true, docs will only be written if db is completely empty.
-                          Useful to avoid indexing the same initial docs again and again.
     :param split_paragraphs: split text in paragraphs.
+
+    :return: None
     """
-    file_paths = Path(document_dir).glob("**/*.txt")
-
-    # check if db has already docs
-    if only_empty_db:
-        n_docs = document_store.get_document_count()
-        if n_docs > 0:
-            logger.info(f"Skip writing documents since DB already contains {n_docs} docs ... "
-                        "(Disable `only_empty_db`, if you want to add docs anyway.)")
-            return None
+    file_paths = [p for p in Path(dir_path).glob("**/*")]
+    if ".pdf" in [p.suffix.lower() for p in file_paths]:
+        pdf_converter = PDFToTextConverter()
+    else:
+        pdf_converter = None

     # read and add docs
-    docs_to_index = []
+    documents = []
     for path in file_paths:
-        with open(path) as doc:
-            text = doc.read()
-            if clean_func:
-                text = clean_func(text)
+        if path.suffix.lower() == ".txt":
+            with open(path) as doc:
+                text = doc.read()
+        elif path.suffix.lower() == ".pdf":
+            pages = pdf_converter.extract_pages(path)
+            text = "\n".join(pages)
+        else:
+            raise Exception(f"Indexing of {path.suffix} files is not currently supported.")

-        if split_paragraphs:
-            for para in text.split("\n\n"):
-                if not para.strip():  # skip empty paragraphs
-                    continue
-                docs_to_index.append(
-                    {
-                        "name": path.name,
-                        "text": para
-                    }
-                )
-        else:
-            docs_to_index.append(
-                {
-                    "name": path.name,
-                    "text": text
-                }
-            )
-    document_store.write_documents(docs_to_index)
-    logger.info(f"Wrote {len(docs_to_index)} docs to DB")
+        if clean_func:
+            text = clean_func(text)
+
+        if split_paragraphs:
+            for para in text.split("\n\n"):
+                if not para.strip():  # skip empty paragraphs
+                    continue
+                documents.append({"name": path.name, "text": para})
+        else:
+            documents.append({"name": path.name, "text": text})
+
+    return documents


 def fetch_archive_from_http(url, output_dir, proxies=None):
@@ -97,3 +93,4 @@ def fetch_archive_from_http(url, output_dir, proxies=None):
         archive.extractall(output_dir)
     # temp_file gets deleted here
     return True
+
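For reference, a short sketch of how the new helper is meant to be used downstream (the directory name is illustrative):

from haystack.indexing.utils import convert_files_to_dicts

# "data/my_docs" is a made-up folder containing .txt and/or .pdf files
dicts = convert_files_to_dicts(dir_path="data/my_docs", split_paragraphs=True)
# each entry looks like {"name": "<file name>", "text": "<paragraph or full text>"}
# and can be passed to a DocumentStore, e.g. document_store.write_documents(dicts)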
@@ -9,4 +9,6 @@ elasticsearch
 elastic-apm
 tox
 coverage
+langdetect # for PDF conversions
+PyMuPDF # for PDF conversions
 # optional: sentence-transformers
@@ -1,7 +1,8 @@
 import tarfile
 import time
 import urllib.request
-from subprocess import Popen, PIPE, STDOUT
+
+from subprocess import Popen, PIPE, STDOUT, run

 import pytest

@@ -19,3 +20,19 @@ def elasticsearch_fixture(elasticsearch_dir):
     thetarfile.extractall(path=elasticsearch_dir)
     es_server = Popen([elasticsearch_dir / "elasticsearch-7.6.1/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT)
     time.sleep(30)
+
+
+@pytest.fixture(scope="session")
+def xpdf_fixture():
+    verify_installation = run(["pdftotext"], shell=True)
+    if verify_installation.returncode == 127:
+        commands = """ wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.02.tar.gz &&
+                       tar -xvf xpdf-tools-linux-4.02.tar.gz && sudo cp xpdf-tools-linux-4.02/bin64/pdftotext /usr/local/bin"""
+        run([commands], shell=True)
+
+        verify_installation = run(["pdftotext -v"], shell=True)
+        if verify_installation.returncode == 127:
+            raise Exception(
+                """pdftotext is not installed. It is part of the xpdf or poppler-utils software suite.
+                   You can download it for your OS from here: https://www.xpdfreader.com/download.html."""
+            )
BIN  test/samples/pdf/sample_pdf_1.pdf  Normal file
Binary file not shown.
BIN  test/samples/pdf/sample_pdf_2.pdf  Normal file
Binary file not shown.
@@ -2,12 +2,13 @@ from time import sleep

 from haystack.database.elasticsearch import ElasticsearchDocumentStore
 from haystack.database.sql import SQLDocumentStore
-from haystack.indexing.io import write_documents_to_db
+from haystack.indexing.utils import convert_files_to_dicts


 def test_sql_write_read():
     sql_document_store = SQLDocumentStore()
-    write_documents_to_db(document_store=sql_document_store, document_dir="samples/docs")
+    documents = convert_files_to_dicts(dir_path="samples/docs")
+    sql_document_store.write_documents(documents)
     documents = sql_document_store.get_all_documents()
     assert len(documents) == 2
     doc = sql_document_store.get_document_by_id("1")
@@ -17,7 +18,8 @@ def test_sql_write_read():

 def test_elasticsearch_write_read(elasticsearch_fixture):
     document_store = ElasticsearchDocumentStore()
-    write_documents_to_db(document_store=document_store, document_dir="samples/docs")
+    documents = convert_files_to_dicts(dir_path="samples/docs")
+    document_store.write_documents(documents)
     sleep(2)  # wait for documents to be available for query
     documents = document_store.get_all_documents()
     assert len(documents) == 2
@@ -2,7 +2,7 @@ def test_module_imports():
     from haystack import Finder
     from haystack.database.sql import SQLDocumentStore
     from haystack.indexing.cleaning import clean_wiki_text
-    from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
+    from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
     from haystack.reader.farm import FARMReader
     from haystack.reader.transformers import TransformersReader
     from haystack.retriever.tfidf import TfidfRetriever
@@ -11,7 +11,7 @@ def test_module_imports():
     assert Finder is not None
     assert SQLDocumentStore is not None
     assert clean_wiki_text is not None
-    assert write_documents_to_db is not None
+    assert convert_files_to_dicts is not None
     assert fetch_archive_from_http is not None
     assert FARMReader is not None
     assert TransformersReader is not None
52  test/test_pdf_conversion.py  Normal file
@@ -0,0 +1,52 @@
import logging
from pathlib import Path

from haystack.indexing.file_converters.pdftotext import PDFToTextConverter

logger = logging.getLogger(__name__)


def test_extract_pages(xpdf_fixture):
    converter = PDFToTextConverter()
    pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
    assert len(pages) == 4  # the sample PDF file has four pages.
    assert pages[0] != ""  # page 1 of the PDF contains text.
    assert pages[2] == ""  # page 3 of the PDF file is empty.


def test_table_removal(xpdf_fixture):
    converter = PDFToTextConverter(remove_numeric_tables=True)
    pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))

    # assert numeric rows are removed from the table.
    assert "324" not in pages[0]
    assert "54x growth" not in pages[0]
    assert "$54.35" not in pages[0]

    # assert text is retained from the document.
    assert "Adobe Systems made the PDF specification available free of charge in 1993." in pages[0]


def test_language_validation(xpdf_fixture, caplog):
    converter = PDFToTextConverter(valid_languages=["en"])
    pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
    assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']." not in caplog.text

    converter = PDFToTextConverter(valid_languages=["de"])
    pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
    assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text


def test_header_footer_removal(xpdf_fixture):
    converter = PDFToTextConverter(remove_header_footer=True)
    converter_no_removal = PDFToTextConverter(remove_header_footer=False)

    pages1 = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
    pages2 = converter_no_removal.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
    for p1, p2 in zip(pages1, pages2):
        assert p1 == p2

    pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header and footer
    for page in pages:
        assert "header" not in page
        assert "footer" not in page
@@ -42,7 +42,7 @@
    "source": [
     "from haystack import Finder\n",
     "from haystack.indexing.cleaning import clean_wiki_text\n",
-    "from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http\n",
+    "from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http\n",
     "from haystack.reader.farm import FARMReader\n",
     "from haystack.reader.transformers import TransformersReader\n",
     "from haystack.utils import print_answers"
@@ -164,11 +164,13 @@
     "s3_url = \"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip\"\n",
     "fetch_archive_from_http(url=s3_url, output_dir=doc_dir)\n",
     "\n",
-    "\n",
-    "# Now, let's write the docs to our DB.\n",
+    "# Convert files to dicts\n",
     "# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)\n",
     "# It must take a str as input, and return a str.\n",
-    "write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True, split_paragraphs=True)"
+    "dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)\n",
+    "\n",
+    "# Now, let's write the dicts containing documents to our DB.\n",
+    "document_store.write_documents(dicts)"
    ]
   },
   {
@@ -16,7 +16,7 @@ import time
 from haystack import Finder
 from haystack.database.elasticsearch import ElasticsearchDocumentStore
 from haystack.indexing.cleaning import clean_wiki_text
-from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
+from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
 from haystack.reader.farm import FARMReader
 from haystack.reader.transformers import TransformersReader
 from haystack.utils import print_answers
@@ -69,10 +69,14 @@ doc_dir = "data/article_txt_got"
 s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
 fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

-# Now, let's write the docs to our DB.
+# convert files to dicts containing documents that can be indexed to our datastore
+dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
 # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
 # It must take a str as input, and return a str.
-write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True, split_paragraphs=True)
+
+# Now, let's write the docs to our DB.
+document_store.write_documents(dicts)

+
 # ## Initalize Retriever, Reader, & Finder
 #
@@ -35,7 +35,7 @@
    "source": [
     "from haystack import Finder\n",
     "from haystack.indexing.cleaning import clean_wiki_text\n",
-    "from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http\n",
+    "from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http\n",
     "from haystack.reader.farm import FARMReader\n",
     "from haystack.reader.transformers import TransformersReader\n",
     "from haystack.utils import print_answers"
@@ -110,11 +110,13 @@
     "s3_url = \"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip\"\n",
     "fetch_archive_from_http(url=s3_url, output_dir=doc_dir)\n",
     "\n",
-    "\n",
-    "# Now, let's write the docs to our DB.\n",
+    "# convert files to dicts containing documents that can be indexed to our datastore\n",
     "# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)\n",
     "# It must take a str as input, and return a str.\n",
-    "write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True, split_paragraphs=True)"
+    "dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)\n",
+    "\n",
+    "# Now, let's write the docs to our DB.\n",
+    "document_store.write_documents(dicts)"
    ]
   },
   {
@@ -11,7 +11,7 @@ from haystack import Finder
 from haystack.database.memory import InMemoryDocumentStore
 from haystack.database.sql import SQLDocumentStore
 from haystack.indexing.cleaning import clean_wiki_text
-from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
+from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
 from haystack.reader.farm import FARMReader
 from haystack.reader.transformers import TransformersReader
 from haystack.retriever.tfidf import TfidfRetriever
@@ -37,11 +37,13 @@ doc_dir = "data/article_txt_got"
 s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
 fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

-# Now, let's write the docs to our DB.
+# convert files to dicts containing documents that can be indexed to our datastore
+dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
 # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
 # It must take a str as input, and return a str.
-write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True, split_paragraphs=True)

+# Now, let's write the docs to our DB.
+document_store.write_documents(dicts)


 # ## Initalize Retriever, Reader, & Finder
@@ -1635,7 +1635,7 @@
   },
   "source": [
    "\n",
-   "from haystack.indexing.io import fetch_archive_from_http\n",
+   "from haystack.indexing.utils import fetch_archive_from_http\n",
    "\n",
    "# Download evaluation data, which is a subset of Natural Questions development set containing 50 documents\n",
    "doc_dir = \"../data/nq\"\n",
@@ -1,5 +1,5 @@
 from haystack.database.elasticsearch import ElasticsearchDocumentStore
-from haystack.indexing.io import fetch_archive_from_http
+from haystack.indexing.utils import fetch_archive_from_http
 from haystack.retriever.elasticsearch import ElasticsearchRetriever
 from haystack.reader.farm import FARMReader
 from haystack.finder import Finder