Add API endpoint to upload files (#154)

parent 8bcc4b26a1
commit a349eef0db
haystack/api/config.py

@@ -41,6 +41,16 @@ if EXCLUDE_META_DATA_FIELDS:

 EMBEDDING_MODEL_PATH = os.getenv("EMBEDDING_MODEL_PATH", None)
 EMBEDDING_MODEL_FORMAT = os.getenv("EMBEDDING_MODEL_FORMAT", "farm")
+
+# File uploads
+FILE_UPLOAD_PATH = os.getenv("FILE_UPLOAD_PATH", "file-uploads")
+REMOVE_NUMERIC_TABLES = os.getenv("REMOVE_NUMERIC_TABLES", "True").lower() == "true"
+REMOVE_WHITESPACE = os.getenv("REMOVE_WHITESPACE", "True").lower() == "true"
+REMOVE_EMPTY_LINES = os.getenv("REMOVE_EMPTY_LINES", "True").lower() == "true"
+REMOVE_HEADER_FOOTER = os.getenv("REMOVE_HEADER_FOOTER", "True").lower() == "true"
+VALID_LANGUAGES = os.getenv("VALID_LANGUAGES", None)
+if VALID_LANGUAGES:
+    VALID_LANGUAGES = ast.literal_eval(VALID_LANGUAGES)

 # Monitoring
 APM_SERVER = os.getenv("APM_SERVER", None)
 APM_SERVICE_NAME = os.getenv("APM_SERVICE_NAME", "haystack-backend")
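Since environment variables are plain strings, VALID_LANGUAGES is parsed with ast.literal_eval into a Python list. A minimal sketch of the expected format (the value shown is illustrative, not part of the commit):

import ast
import os

os.environ["VALID_LANGUAGES"] = '["en", "de"]'  # hypothetical example value

valid_languages = os.getenv("VALID_LANGUAGES", None)
if valid_languages:
    valid_languages = ast.literal_eval(valid_languages)  # -> ["en", "de"]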
haystack/api/controller/file_upload.py (new file, 80 lines)

@@ -0,0 +1,80 @@
import logging
import shutil
import uuid
from pathlib import Path
from typing import Optional, List

from fastapi import APIRouter
from fastapi import HTTPException
from fastapi import UploadFile, File, Form

from haystack.api.config import DB_HOST, DB_PORT, DB_USER, DB_PW, DB_INDEX, ES_CONN_SCHEME, TEXT_FIELD_NAME, \
    SEARCH_FIELD_NAME, FILE_UPLOAD_PATH, EMBEDDING_DIM, EMBEDDING_FIELD_NAME, EXCLUDE_META_DATA_FIELDS, VALID_LANGUAGES, \
    FAQ_QUESTION_FIELD_NAME, REMOVE_NUMERIC_TABLES, REMOVE_WHITESPACE, REMOVE_EMPTY_LINES, REMOVE_HEADER_FOOTER
from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.indexing.file_converters.pdf import PDFToTextConverter
from haystack.indexing.file_converters.txt import TextConverter


logger = logging.getLogger(__name__)
router = APIRouter()


document_store = ElasticsearchDocumentStore(
    host=DB_HOST,
    port=DB_PORT,
    username=DB_USER,
    password=DB_PW,
    index=DB_INDEX,
    scheme=ES_CONN_SCHEME,
    ca_certs=False,
    verify_certs=False,
    text_field=TEXT_FIELD_NAME,
    search_fields=SEARCH_FIELD_NAME,
    embedding_dim=EMBEDDING_DIM,
    embedding_field=EMBEDDING_FIELD_NAME,
    excluded_meta_data=EXCLUDE_META_DATA_FIELDS,  # type: ignore
    faq_question_field=FAQ_QUESTION_FIELD_NAME,
)


@router.post("/file-upload")
def upload_file_to_document_store(
    file: UploadFile = File(...),
    remove_numeric_tables: Optional[bool] = Form(REMOVE_NUMERIC_TABLES),
    remove_whitespace: Optional[bool] = Form(REMOVE_WHITESPACE),
    remove_empty_lines: Optional[bool] = Form(REMOVE_EMPTY_LINES),
    remove_header_footer: Optional[bool] = Form(REMOVE_HEADER_FOOTER),
    valid_languages: Optional[List[str]] = Form(VALID_LANGUAGES),
) -> None:
    try:
        file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{file.filename}"
        with file_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        if file.filename.split(".")[-1].lower() == "pdf":
            pdf_converter = PDFToTextConverter(
                remove_numeric_tables=remove_numeric_tables,
                remove_whitespace=remove_whitespace,
                remove_empty_lines=remove_empty_lines,
                remove_header_footer=remove_header_footer,
                valid_languages=valid_languages,
            )
            pages = pdf_converter.extract_pages(file_path)
        elif file.filename.split(".")[-1].lower() == "txt":
            txt_converter = TextConverter(
                remove_numeric_tables=remove_numeric_tables,
                remove_whitespace=remove_whitespace,
                remove_empty_lines=remove_empty_lines,
                remove_header_footer=remove_header_footer,
                valid_languages=valid_languages,
            )
            pages = txt_converter.extract_pages(file_path)
        else:
            raise HTTPException(status_code=415, detail="Only .pdf and .txt file formats are supported.")

        document = {TEXT_FIELD_NAME: "\n".join(pages), "name": file.filename}
        document_store.write_documents([document])

    finally:
        file.file.close()
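With this controller in place, the endpoint accepts a multipart form upload. A minimal client-side sketch using requests (the host/port and file name are assumptions about a local deployment, not part of the commit):

import requests

with open("sample.pdf", "rb") as f:
    response = requests.post(
        "http://localhost:8000/file-upload",
        files={"file": ("sample.pdf", f, "application/pdf")},
        data={"remove_numeric_tables": "true"},  # form fields are sent as strings
    )
response.raise_for_status()  # a 415 is returned for unsupported file formats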
haystack/api/application.py

@@ -1,8 +1,9 @@
 from fastapi import APIRouter

-from haystack.api.controller import search, feedback
+from haystack.api.controller import search, feedback, file_upload

 router = APIRouter()

 router.include_router(search.router, tags=["search"])
 router.include_router(feedback.router, tags=["feedback"])
+router.include_router(file_upload.router, tags=["file-upload"])
haystack/indexing/file_converters/base.py

@@ -1,6 +1,10 @@
 from abc import abstractmethod
+from functools import partial, reduce
+from itertools import chain
 from pathlib import Path
-from typing import List, Optional
+from typing import List, Optional, Tuple, Generator, Set
+
+import langdetect


 class BaseConverter:
@@ -43,3 +47,97 @@ class BaseConverter:
     @abstractmethod
     def extract_pages(self, file_path: Path) -> List[str]:
         pass
+
+    def validate_language(self, text: str) -> bool:
+        """
+        Validate if the language of the text is one of the valid languages.
+        """
+        if not self.valid_languages:
+            return True
+
+        try:
+            lang = langdetect.detect(text)
+        except langdetect.lang_detect_exception.LangDetectException:
+            lang = None
+
+        if lang in self.valid_languages:
+            return True
+        else:
+            return False
+
+    def find_and_remove_header_footer(
+        self, pages: List[str], n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
+    ) -> Tuple[List[str], Optional[str], Optional[str]]:
+        """
+        Heuristic to find footers and headers across different pages by searching for the longest common string.
+        For headers we only search in the first n_chars characters (for footers: the last n_chars).
+        Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
+        but won't detect "Page 3 of 4" or similar.
+
+        :param pages: list of strings, one string per page
+        :param n_chars: number of first/last characters in which the header/footer shall be searched
+        :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
+        :param n_last_pages_to_ignore: number of last pages to ignore
+        :return: (cleaned pages, found_header_str, found_footer_str)
+        """
+
+        # header
+        start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
+        found_header = self._find_longest_common_ngram(start_of_pages)
+        if found_header:
+            pages = [page.replace(found_header, "") for page in pages]
+
+        # footer
+        end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
+        found_footer = self._find_longest_common_ngram(end_of_pages)
+        if found_footer:
+            pages = [page.replace(found_footer, "") for page in pages]
+        return pages, found_header, found_footer
+
+    def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
+        """
+        Return ngrams (of tokens - currently split by whitespace)
+        :param seq: str, string from which the ngrams shall be created
+        :param n: int, n of ngram
+        :return: str, ngram as string
+        """
+
+        # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
+        # we add a space here and remove it after creation of the ngrams again (see below)
+        seq = seq.replace("\n", " \n")
+        seq = seq.replace("\t", " \t")
+
+        words = seq.split(" ")
+        ngrams = (
+            " ".join(words[i: i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
+        )
+
+        return ngrams
+
+    def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
+        lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
+        ngrams = map(partial(self._ngram, seq), lengths)
+        res = set(chain.from_iterable(ngrams))
+        return res
+
+    def _find_longest_common_ngram(self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3) -> Optional[str]:
+        """
+        Find the longest common ngram across different text sequences (e.g. start of pages).
+        Considers all ngrams within the specified range. Helpful for finding footers, headers etc.
+
+        :param sequences: list[str], list of strings that shall be searched for common n_grams
+        :param max_ngram: int, maximum length of ngram to consider
+        :param min_ngram: minimum length of ngram to consider
+        :return: str, common string of all sections
+        """
+
+        seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
+        intersection = reduce(set.intersection, seqs_ngrams)
+
+        try:
+            longest = max(intersection, key=len)
+        except ValueError:
+            # no common sequence found
+            longest = ""
+        return longest if longest.strip() else None
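The header/footer heuristic boils down to intersecting the n-gram sets of each page's first (or last) n_chars characters and keeping the longest survivor. A self-contained toy sketch of the same idea (standalone functions mirroring _ngram/_allngram, written here for illustration only):

from functools import reduce
from itertools import chain

def ngrams(seq, n):
    # Tokenize on spaces while preserving newlines, as in BaseConverter._ngram
    words = seq.replace("\n", " \n").split(" ")
    return (" ".join(words[i:i + n]).replace(" \n", "\n") for i in range(len(words) - n + 1))

def all_ngrams(seq, min_n, max_n):
    return set(chain.from_iterable(ngrams(seq, n) for n in range(min_n, max_n)))

pages = [
    "ACME Annual Report 2019\nRevenue grew by twelve percent this year.",
    "ACME Annual Report 2019\nOperating costs fell slightly in Q4.",
]
# Intersect the n-gram sets of the first 40 characters of every page
common = reduce(set.intersection, (all_ngrams(p[:40], 3, 30) for p in pages))
print(max(common, key=len))  # -> "ACME Annual Report 2019"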
haystack/indexing/file_converters/pdf.py

@@ -1,12 +1,8 @@
 import logging
 import re
 import subprocess
-from functools import partial, reduce
-from itertools import chain
 from pathlib import Path
-from typing import List, Optional, Tuple, Generator, Set
-
-import langdetect
+from typing import List, Optional

 from haystack.indexing.file_converters.base import BaseConverter

@@ -106,7 +102,7 @@ class PDFToTextConverter(BaseConverter):

         if self.valid_languages:
             document_text = "".join(cleaned_pages)
-            if not self._validate_language(document_text):
+            if not self.validate_language(document_text):
                 logger.warning(
                     f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
                     f"been decoded in the correct text format."
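The rename from _validate_language to validate_language reflects its move to the public surface of BaseConverter. For intuition, langdetect classifies raw text into ISO 639-1 codes; a quick illustrative check (assuming langdetect is installed):

import langdetect

print(langdetect.detect("Dies ist ein deutscher Beispieltext."))  # -> "de"
# A converter configured with valid_languages=["en"] would log a warning
# for this document, flagging a possible decoding problem.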
@@ -138,95 +134,3 @@ class PDFToTextConverter(BaseConverter):
         pages = pages[:-1]  # the last page in the split is always empty.
         return pages
-
-    def _validate_language(self, text: str) -> bool:
-        """
-        Validate if the language of the text is one of valid languages.
-        """
-        if not self.valid_languages:
-            return True
-
-        try:
-            lang = langdetect.detect(text)
-        except langdetect.lang_detect_exception.LangDetectException:
-            lang = None
-
-        if lang in self.valid_languages:
-            return True
-        else:
-            return False
-
-    def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
-        """
-        Return ngram (of tokens - currently splitted by whitespace)
-        :param seq: str, string from which the ngram shall be created
-        :param n: int, n of ngram
-        :return: str, ngram as string
-        """
-
-        # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
-        # we add a space here and remove it after creation of the ngrams again (see below)
-        seq = seq.replace("\n", " \n")
-        seq = seq.replace("\t", " \t")
-
-        words = seq.split(" ")
-        ngrams = (
-            " ".join(words[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
-        )
-
-        return ngrams
-
-    def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
-        lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
-        ngrams = map(partial(self._ngram, seq), lengths)
-        res = set(chain.from_iterable(ngrams))
-        return res
-
-    def find_longest_common_ngram(self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3) -> Optional[str]:
-        """
-        Find the longest common ngram across different text sequences (e.g. start of pages).
-        Considering all ngrams between the specified range. Helpful for finding footers, headers etc.
-
-        :param sequences: list[str], list of strings that shall be searched for common n_grams
-        :param max_ngram: int, maximum length of ngram to consider
-        :param min_ngram: minimum length of ngram to consider
-        :return: str, common string of all sections
-        """
-
-        seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
-        intersection = reduce(set.intersection, seqs_ngrams)
-
-        try:
-            longest = max(intersection, key=len)
-        except ValueError:
-            # no common sequence found
-            longest = ""
-        return longest if longest.strip() else None
-
-    def find_and_remove_header_footer(
-        self, pages: List[str], n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
-    ) -> Tuple[List[str], Optional[str], Optional[str]]:
-        """
-        Heuristic to find footers and headers across different pages by searching for the longest common string.
-        For headers we only search in the first n_chars characters (for footer: last n_chars).
-        Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
-        but won't detect "Page 3 of 4" or similar.
-
-        :param pages: list of strings, one string per page
-        :param n_chars: number of first/last characters where the header/footer shall be searched in
-        :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
-        :param n_last_pages_to_ignore: number of last pages to ignore
-        :return: (cleaned pages, found_header_str, found_footer_str)
-        """
-
-        # header
-        start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
-        found_header = self.find_longest_common_ngram(start_of_pages)
-        if found_header:
-            pages = [page.replace(found_header, "") for page in pages]
-
-        # footer
-        end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
-        found_footer = self.find_longest_common_ngram(end_of_pages)
-        if found_footer:
-            pages = [page.replace(found_footer, "") for page in pages]
-        return pages, found_header, found_footer
haystack/indexing/file_converters/txt.py (new file, 93 lines)

@@ -0,0 +1,93 @@
import logging
import re
from pathlib import Path
from typing import List, Optional

from haystack.indexing.file_converters.base import BaseConverter

logger = logging.getLogger(__name__)


class TextConverter(BaseConverter):
    def __init__(
        self,
        remove_numeric_tables: Optional[bool] = False,
        remove_whitespace: Optional[bool] = None,
        remove_empty_lines: Optional[bool] = None,
        remove_header_footer: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
    ):
        """
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for answers.
                                      The rows containing strings are thus retained in this option.
        :param remove_whitespace: strip whitespace before and after each line in the text.
        :param remove_empty_lines: collapse runs of more than two empty lines in the text.
        :param remove_header_footer: use a heuristic to remove footers and headers across different pages by searching
                                     for the longest common string. This heuristic uses exact matches and therefore
                                     works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
                                     or similar.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add a test for encoding errors. If the extracted text is
                                not one of the valid languages, then it is likely an encoding error resulting
                                in garbled text.
        """

        super().__init__(
            remove_numeric_tables=remove_numeric_tables,
            remove_whitespace=remove_whitespace,
            remove_empty_lines=remove_empty_lines,
            remove_header_footer=remove_header_footer,
            valid_languages=valid_languages,
        )

    def extract_pages(self, file_path: Path) -> List[str]:
        with open(file_path) as f:
            text = f.read()
        pages = text.split("\f")

        cleaned_pages = []
        for page in pages:
            lines = page.splitlines()
            cleaned_lines = []
            for line in lines:
                words = line.split()
                digits = [word for word in words if any(i.isdigit() for i in word)]

                # remove lines having > 40% of words as digits AND not ending with a period(.)
                if self.remove_numeric_tables:
                    if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
                        logger.debug(f"Removing line '{line}' from {file_path}")
                        continue

                if self.remove_whitespace:
                    line = line.strip()

                cleaned_lines.append(line)

            page = "\n".join(cleaned_lines)

            if self.remove_empty_lines:
                page = re.sub(r"\n\n+", "\n\n", page)

            cleaned_pages.append(page)

        if self.valid_languages:
            document_text = "".join(cleaned_pages)
            if not self.validate_language(document_text):
                logger.warning(
                    f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
                    f"been decoded in the correct text format."
                )

        if self.remove_header_footer:
            cleaned_pages, header, footer = self.find_and_remove_header_footer(
                cleaned_pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
            )
            logger.info(f"Removed header '{header}' and footer '{footer}' in {file_path}")

        return cleaned_pages
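A hedged usage sketch for the new TextConverter (the file path is illustrative; "pages" are the form-feed-separated sections of the input file):

from pathlib import Path
from haystack.indexing.file_converters.txt import TextConverter

converter = TextConverter(
    remove_numeric_tables=True,
    remove_whitespace=True,
    remove_empty_lines=True,
    remove_header_footer=True,
    valid_languages=["en"],
)
pages = converter.extract_pages(Path("data/report.txt"))  # hypothetical path
print(f"Extracted {len(pages)} pages")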
haystack/indexing/utils.py

@@ -7,7 +7,7 @@ from typing import Callable, List, Optional

 from farm.data_handler.utils import http_get

-from haystack.indexing.file_converters.pdftotext import PDFToTextConverter
+from haystack.indexing.file_converters.pdf import PDFToTextConverter

 logger = logging.getLogger(__name__)

@@ -1,7 +1,7 @@
 import logging
 from pathlib import Path

-from haystack.indexing.file_converters.pdftotext import PDFToTextConverter
+from haystack.indexing.file_converters.pdf import PDFToTextConverter

 logger = logging.getLogger(__name__)
