Add API endpoint to upload files (#154)

Tanay Soni 2020-06-17 16:28:26 +02:00 committed by GitHub
parent 8bcc4b26a1
commit a349eef0db
8 changed files with 288 additions and 102 deletions


@@ -41,6 +41,16 @@ if EXCLUDE_META_DATA_FIELDS:
EMBEDDING_MODEL_PATH = os.getenv("EMBEDDING_MODEL_PATH", None)
EMBEDDING_MODEL_FORMAT = os.getenv("EMBEDDING_MODEL_FORMAT", "farm")
# File uploads
FILE_UPLOAD_PATH = os.getenv("FILE_UPLOAD_PATH", "file-uploads")
REMOVE_NUMERIC_TABLES = os.getenv("REMOVE_NUMERIC_TABLES", "True").lower() == "true"
REMOVE_WHITESPACE = os.getenv("REMOVE_WHITESPACE", "True").lower() == "true"
REMOVE_EMPTY_LINES = os.getenv("REMOVE_EMPTY_LINES", "True").lower() == "true"
REMOVE_HEADER_FOOTER = os.getenv("REMOVE_HEADER_FOOTER", "True").lower() == "true"
VALID_LANGUAGES = os.getenv("VALID_LANGUAGES", None)
if VALID_LANGUAGES:
VALID_LANGUAGES = ast.literal_eval(VALID_LANGUAGES)
# Monitoring
APM_SERVER = os.getenv("APM_SERVER", None)
APM_SERVICE_NAME = os.getenv("APM_SERVICE_NAME", "haystack-backend")
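Note on VALID_LANGUAGES: the value is parsed with ast.literal_eval, so the environment variable must contain a Python-style list literal. A minimal sketch of how these settings resolve, with hypothetical example values:

import ast
import os

# hypothetical environment, e.g. set in the shell or docker-compose
os.environ["REMOVE_NUMERIC_TABLES"] = "False"
os.environ["VALID_LANGUAGES"] = '["en", "de"]'  # a list literal, not a bare language code

remove_numeric_tables = os.getenv("REMOVE_NUMERIC_TABLES", "True").lower() == "true"  # -> False
valid_languages = os.getenv("VALID_LANGUAGES", None)
if valid_languages:
    valid_languages = ast.literal_eval(valid_languages)  # -> ["en", "de"]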


@@ -0,0 +1,80 @@
import logging
import shutil
import uuid
from pathlib import Path
from typing import Optional, List
from fastapi import APIRouter
from fastapi import HTTPException
from fastapi import UploadFile, File, Form
from haystack.api.config import DB_HOST, DB_PORT, DB_USER, DB_PW, DB_INDEX, ES_CONN_SCHEME, TEXT_FIELD_NAME, \
SEARCH_FIELD_NAME, FILE_UPLOAD_PATH, EMBEDDING_DIM, EMBEDDING_FIELD_NAME, EXCLUDE_META_DATA_FIELDS, VALID_LANGUAGES, \
FAQ_QUESTION_FIELD_NAME, REMOVE_NUMERIC_TABLES, REMOVE_WHITESPACE, REMOVE_EMPTY_LINES, REMOVE_HEADER_FOOTER
from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.indexing.file_converters.pdf import PDFToTextConverter
from haystack.indexing.file_converters.txt import TextConverter
logger = logging.getLogger(__name__)

router = APIRouter()

document_store = ElasticsearchDocumentStore(
    host=DB_HOST,
    port=DB_PORT,
    username=DB_USER,
    password=DB_PW,
    index=DB_INDEX,
    scheme=ES_CONN_SCHEME,
    ca_certs=False,
    verify_certs=False,
    text_field=TEXT_FIELD_NAME,
    search_fields=SEARCH_FIELD_NAME,
    embedding_dim=EMBEDDING_DIM,
    embedding_field=EMBEDDING_FIELD_NAME,
    excluded_meta_data=EXCLUDE_META_DATA_FIELDS,  # type: ignore
    faq_question_field=FAQ_QUESTION_FIELD_NAME,
)
@router.post("/file-upload")
def upload_file_to_document_store(
    file: UploadFile = File(...),
    remove_numeric_tables: Optional[bool] = Form(REMOVE_NUMERIC_TABLES),
    remove_whitespace: Optional[bool] = Form(REMOVE_WHITESPACE),
    remove_empty_lines: Optional[bool] = Form(REMOVE_EMPTY_LINES),
    remove_header_footer: Optional[bool] = Form(REMOVE_HEADER_FOOTER),
    valid_languages: Optional[List[str]] = Form(VALID_LANGUAGES),
) -> None:
    try:
        file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{file.filename}"
        with file_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        if file.filename.split(".")[-1].lower() == "pdf":
            pdf_converter = PDFToTextConverter(
                remove_numeric_tables=remove_numeric_tables,
                remove_whitespace=remove_whitespace,
                remove_empty_lines=remove_empty_lines,
                remove_header_footer=remove_header_footer,
                valid_languages=valid_languages,
            )
            pages = pdf_converter.extract_pages(file_path)
        elif file.filename.split(".")[-1].lower() == "txt":
            txt_converter = TextConverter(
                remove_numeric_tables=remove_numeric_tables,
                remove_whitespace=remove_whitespace,
                remove_empty_lines=remove_empty_lines,
                remove_header_footer=remove_header_footer,
                valid_languages=valid_languages,
            )
            pages = txt_converter.extract_pages(file_path)
        else:
            raise HTTPException(status_code=415, detail="Only .pdf and .txt file formats are supported.")

        document = {TEXT_FIELD_NAME: "\n".join(pages), "name": file.filename}
        document_store.write_documents([document])
    finally:
        file.file.close()
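A quick way to exercise the new endpoint from a client; the host and port are assumptions about where the API is served, and sample.pdf is a made-up file:

import requests

with open("sample.pdf", "rb") as f:
    response = requests.post(
        "http://localhost:8000/file-upload",  # hypothetical deployment address
        files={"file": ("sample.pdf", f, "application/pdf")},
        # form fields mirror the endpoint's optional parameters
        data={"remove_numeric_tables": "true", "remove_header_footer": "true"},
    )
response.raise_for_status()  # the endpoint answers 415 for formats other than .pdf/.txt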


@@ -1,8 +1,9 @@
from fastapi import APIRouter
-from haystack.api.controller import search, feedback
+from haystack.api.controller import search, feedback, file_upload
router = APIRouter()
router.include_router(search.router, tags=["search"])
router.include_router(feedback.router, tags=["feedback"])
router.include_router(file_upload.router, tags=["file-upload"])
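For reference, a minimal sketch of how this combined router would typically be mounted on the FastAPI application; the app module is not part of this diff, and the import path here is an assumption:

from fastapi import FastAPI

from haystack.api.controller.router import router as api_router  # assumed module path

app = FastAPI(title="Haystack API")
app.include_router(api_router)
# POST /file-upload is now served alongside the search and feedback routes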


@@ -1,6 +1,10 @@
from abc import abstractmethod
from functools import partial, reduce
from itertools import chain
from pathlib import Path
-from typing import List, Optional
+from typing import List, Optional, Tuple, Generator, Set
import langdetect
class BaseConverter:
@@ -43,3 +47,97 @@ class BaseConverter:
    @abstractmethod
    def extract_pages(self, file_path: Path) -> List[str]:
        pass
    def validate_language(self, text: str) -> bool:
        """
        Validate whether the language of the text is one of the valid languages.
        """
        if not self.valid_languages:
            return True

        try:
            lang = langdetect.detect(text)
        except langdetect.lang_detect_exception.LangDetectException:
            lang = None

        if lang in self.valid_languages:
            return True
        else:
            return False

    def find_and_remove_header_footer(
        self, pages: List[str], n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
    ) -> Tuple[List[str], Optional[str], Optional[str]]:
        """
        Heuristic to find footers and headers across different pages by searching for the longest common string.
        For headers, we only search the first n_chars characters (for footers: the last n_chars).
        Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
        but won't detect "Page 3 of 4" or similar.

        :param pages: list of strings, one string per page
        :param n_chars: number of first/last characters within which to search for the header/footer
        :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
        :param n_last_pages_to_ignore: number of last pages to ignore
        :return: (cleaned pages, found_header_str, found_footer_str)
        """
        # header
        start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
        found_header = self._find_longest_common_ngram(start_of_pages)
        if found_header:
            pages = [page.replace(found_header, "") for page in pages]

        # footer
        end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
        found_footer = self._find_longest_common_ngram(end_of_pages)
        if found_footer:
            pages = [page.replace(found_footer, "") for page in pages]
        return pages, found_header, found_footer

    def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
        """
        Return all ngrams of tokens (currently split on whitespace).

        :param seq: str, string from which the ngrams shall be created
        :param n: int, n of ngram
        :return: generator of ngram strings
        """
        # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
        # we add a space here and remove it after creation of the ngrams again (see below)
        seq = seq.replace("\n", " \n")
        seq = seq.replace("\t", " \t")
        words = seq.split(" ")
        ngrams = (
            " ".join(words[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
        )
        return ngrams

    def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
        lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
        ngrams = map(partial(self._ngram, seq), lengths)
        res = set(chain.from_iterable(ngrams))
        return res

    def _find_longest_common_ngram(self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3) -> Optional[str]:
        """
        Find the longest common ngram across different text sequences (e.g. start of pages).
        Considers all ngrams within the specified range. Helpful for finding footers, headers etc.

        :param sequences: list[str], list of strings that shall be searched for common n-grams
        :param max_ngram: int, maximum length of ngram to consider
        :param min_ngram: minimum length of ngram to consider
        :return: longest common string found in all sequences, or None
        """
        seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
        intersection = reduce(set.intersection, seqs_ngrams)
        try:
            longest = max(intersection, key=len)
        except ValueError:
            # no common sequence found
            longest = ""
        return longest if longest.strip() else None
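To make the header/footer heuristic concrete, a small demo with made-up pages. The first and last pages are skipped via the ignore parameters, and the longest common ngram at the page starts/ends is treated as header/footer:

from haystack.indexing.file_converters.txt import TextConverter

converter = TextConverter()  # any concrete BaseConverter subclass works
pages = [
    "Table of Contents\n1. Introduction",  # skipped: n_first_pages_to_ignore=1
    "ACME Annual Report 2019\nFirst page body text.\nCopyright 2019 by ACME Corp",
    "ACME Annual Report 2019\nSecond page body here.\nCopyright 2019 by ACME Corp",
    "Appendix A",  # skipped: n_last_pages_to_ignore=1
]
cleaned, header, footer = converter.find_and_remove_header_footer(
    pages, n_chars=40, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
)
print(header)  # 'ACME Annual Report 2019'
print(footer)  # '\nCopyright 2019 by ACME Corp' (the preceding newline is part of the matched ngram)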


@@ -1,12 +1,8 @@
import logging
import re
import subprocess
-from functools import partial, reduce
-from itertools import chain
from pathlib import Path
-from typing import List, Optional, Tuple, Generator, Set
-import langdetect
+from typing import List, Optional
from haystack.indexing.file_converters.base import BaseConverter
@@ -106,7 +102,7 @@ class PDFToTextConverter(BaseConverter):
        if self.valid_languages:
            document_text = "".join(cleaned_pages)
-            if not self._validate_language(document_text):
+            if not self.validate_language(document_text):
                logger.warning(
                    f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
                    f"been decoded in the correct text format."
@@ -138,95 +134,3 @@ class PDFToTextConverter(BaseConverter):
        pages = pages[:-1]  # the last page in the split is always empty.
        return pages
    def _validate_language(self, text: str) -> bool:
        """
        Validate whether the language of the text is one of the valid languages.
        """
        if not self.valid_languages:
            return True

        try:
            lang = langdetect.detect(text)
        except langdetect.lang_detect_exception.LangDetectException:
            lang = None

        if lang in self.valid_languages:
            return True
        else:
            return False

    def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
        """
        Return all ngrams of tokens (currently split on whitespace).

        :param seq: str, string from which the ngrams shall be created
        :param n: int, n of ngram
        :return: generator of ngram strings
        """
        # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
        # we add a space here and remove it after creation of the ngrams again (see below)
        seq = seq.replace("\n", " \n")
        seq = seq.replace("\t", " \t")
        words = seq.split(" ")
        ngrams = (
            " ".join(words[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
        )
        return ngrams

    def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
        lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
        ngrams = map(partial(self._ngram, seq), lengths)
        res = set(chain.from_iterable(ngrams))
        return res

    def find_longest_common_ngram(self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3) -> Optional[str]:
        """
        Find the longest common ngram across different text sequences (e.g. start of pages).
        Considers all ngrams within the specified range. Helpful for finding footers, headers etc.

        :param sequences: list[str], list of strings that shall be searched for common n-grams
        :param max_ngram: int, maximum length of ngram to consider
        :param min_ngram: minimum length of ngram to consider
        :return: longest common string found in all sequences, or None
        """
        seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
        intersection = reduce(set.intersection, seqs_ngrams)
        try:
            longest = max(intersection, key=len)
        except ValueError:
            # no common sequence found
            longest = ""
        return longest if longest.strip() else None

    def find_and_remove_header_footer(
        self, pages: List[str], n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
    ) -> Tuple[List[str], Optional[str], Optional[str]]:
        """
        Heuristic to find footers and headers across different pages by searching for the longest common string.
        For headers, we only search the first n_chars characters (for footers: the last n_chars).
        Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
        but won't detect "Page 3 of 4" or similar.

        :param pages: list of strings, one string per page
        :param n_chars: number of first/last characters within which to search for the header/footer
        :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
        :param n_last_pages_to_ignore: number of last pages to ignore
        :return: (cleaned pages, found_header_str, found_footer_str)
        """
        # header
        start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
        found_header = self.find_longest_common_ngram(start_of_pages)
        if found_header:
            pages = [page.replace(found_header, "") for page in pages]

        # footer
        end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
        found_footer = self.find_longest_common_ngram(end_of_pages)
        if found_footer:
            pages = [page.replace(found_footer, "") for page in pages]
        return pages, found_header, found_footer


@@ -0,0 +1,93 @@
import logging
import re
from pathlib import Path
from typing import List, Optional
from haystack.indexing.file_converters.base import BaseConverter
logger = logging.getLogger(__name__)
class TextConverter(BaseConverter):
    def __init__(
        self,
        remove_numeric_tables: Optional[bool] = False,
        remove_whitespace: Optional[bool] = None,
        remove_empty_lines: Optional[bool] = None,
        remove_header_footer: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
    ):
        """
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for answers.
                                      Rows containing such strings are therefore retained by this option.
        :param remove_whitespace: strip whitespace before and after each line in the text.
        :param remove_empty_lines: collapse runs of multiple empty lines in the text.
        :param remove_header_footer: use a heuristic to remove footers and headers across different pages by searching
                                     for the longest common string. This heuristic uses exact matches and therefore
                                     works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
                                     or similar.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add a test for encoding errors. If the extracted text is
                                not one of the valid languages, it is likely an encoding error resulting
                                in garbled text.
        """
        super().__init__(
            remove_numeric_tables=remove_numeric_tables,
            remove_whitespace=remove_whitespace,
            remove_empty_lines=remove_empty_lines,
            remove_header_footer=remove_header_footer,
            valid_languages=valid_languages,
        )

    def extract_pages(self, file_path: Path) -> List[str]:
        with open(file_path) as f:
            text = f.read()
        pages = text.split("\f")

        cleaned_pages = []
        for page in pages:
            lines = page.splitlines()
            cleaned_lines = []
            for line in lines:
                words = line.split()
                digits = [word for word in words if any(i.isdigit() for i in word)]

                # remove lines having > 40% of words as digits AND not ending with a period (.)
                if self.remove_numeric_tables:
                    if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
                        logger.debug(f"Removing line '{line}' from {file_path}")
                        continue

                if self.remove_whitespace:
                    line = line.strip()

                cleaned_lines.append(line)

            page = "\n".join(cleaned_lines)

            if self.remove_empty_lines:
                page = re.sub(r"\n\n+", "\n\n", page)

            cleaned_pages.append(page)

        if self.valid_languages:
            document_text = "".join(cleaned_pages)
            if not self.validate_language(document_text):
                logger.warning(
                    f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
                    f"been decoded in the correct text format."
                )

        if self.remove_header_footer:
            cleaned_pages, header, footer = self.find_and_remove_header_footer(
                cleaned_pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
            )
            logger.info(f"Removed header '{header}' and footer '{footer}' in {file_path}")

        return cleaned_pages
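A minimal end-to-end sketch of the converter on its own; the file name and contents are made up, and pages in a .txt file are separated by form-feed characters (\f):

from pathlib import Path

from haystack.indexing.file_converters.txt import TextConverter

sample = Path("sample.txt")  # hypothetical input file
sample.write_text("Intro line\n\n\n\nBody text.\n\fSecond page.\n12 45 78 11 90\n")

converter = TextConverter(remove_numeric_tables=True, remove_empty_lines=True)
pages = converter.extract_pages(sample)
print(pages)  # two pages: the blank-line run is collapsed and the digit-heavy row is dropped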


@@ -7,7 +7,7 @@ from typing import Callable, List, Optional
from farm.data_handler.utils import http_get
-from haystack.indexing.file_converters.pdftotext import PDFToTextConverter
+from haystack.indexing.file_converters.pdf import PDFToTextConverter
logger = logging.getLogger(__name__)


@@ -1,7 +1,7 @@
import logging
from pathlib import Path
-from haystack.indexing.file_converters.pdftotext import PDFToTextConverter
+from haystack.indexing.file_converters.pdf import PDFToTextConverter
logger = logging.getLogger(__name__)