diff --git a/haystack/api/config.py b/haystack/api/config.py
index 6c489dd81..f0b2670d2 100644
--- a/haystack/api/config.py
+++ b/haystack/api/config.py
@@ -41,6 +41,16 @@ if EXCLUDE_META_DATA_FIELDS:
 EMBEDDING_MODEL_PATH = os.getenv("EMBEDDING_MODEL_PATH", None)
 EMBEDDING_MODEL_FORMAT = os.getenv("EMBEDDING_MODEL_FORMAT", "farm")
 
+# File uploads
+FILE_UPLOAD_PATH = os.getenv("FILE_UPLOAD_PATH", "file-uploads")
+REMOVE_NUMERIC_TABLES = os.getenv("REMOVE_NUMERIC_TABLES", "True").lower() == "true"
+REMOVE_WHITESPACE = os.getenv("REMOVE_WHITESPACE", "True").lower() == "true"
+REMOVE_EMPTY_LINES = os.getenv("REMOVE_EMPTY_LINES", "True").lower() == "true"
+REMOVE_HEADER_FOOTER = os.getenv("REMOVE_HEADER_FOOTER", "True").lower() == "true"
+VALID_LANGUAGES = os.getenv("VALID_LANGUAGES", None)
+if VALID_LANGUAGES:
+    VALID_LANGUAGES = ast.literal_eval(VALID_LANGUAGES)
+
 # Monitoring
 APM_SERVER = os.getenv("APM_SERVER", None)
 APM_SERVICE_NAME = os.getenv("APM_SERVICE_NAME", "haystack-backend")
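The new settings are plain environment variables; `VALID_LANGUAGES` is expected to hold a Python-style list literal, which `ast.literal_eval` turns into a real list (`ast` is presumably already imported in config.py, since `EXCLUDE_META_DATA_FIELDS` is parsed the same way). A minimal sketch of the parsing, with an illustrative value:

```python
import ast
import os

# Illustrative value; in practice this is set in the shell or a compose file.
os.environ["VALID_LANGUAGES"] = '["en", "de"]'

valid_languages = os.getenv("VALID_LANGUAGES", None)
if valid_languages:
    # Safely parses the list literal without using eval()
    valid_languages = ast.literal_eval(valid_languages)
assert valid_languages == ["en", "de"]
```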
diff --git a/haystack/api/controller/file_upload.py b/haystack/api/controller/file_upload.py
new file mode 100644
index 000000000..9ffecc1b6
--- /dev/null
+++ b/haystack/api/controller/file_upload.py
@@ -0,0 +1,80 @@
+import logging
+import shutil
+import uuid
+from pathlib import Path
+from typing import Optional, List
+
+from fastapi import APIRouter
+from fastapi import HTTPException
+from fastapi import UploadFile, File, Form
+
+from haystack.api.config import DB_HOST, DB_PORT, DB_USER, DB_PW, DB_INDEX, ES_CONN_SCHEME, TEXT_FIELD_NAME, \
+    SEARCH_FIELD_NAME, FILE_UPLOAD_PATH, EMBEDDING_DIM, EMBEDDING_FIELD_NAME, EXCLUDE_META_DATA_FIELDS, VALID_LANGUAGES, \
+    FAQ_QUESTION_FIELD_NAME, REMOVE_NUMERIC_TABLES, REMOVE_WHITESPACE, REMOVE_EMPTY_LINES, REMOVE_HEADER_FOOTER
+from haystack.database.elasticsearch import ElasticsearchDocumentStore
+from haystack.indexing.file_converters.pdf import PDFToTextConverter
+from haystack.indexing.file_converters.txt import TextConverter
+
+
+logger = logging.getLogger(__name__)
+router = APIRouter()
+
+
+document_store = ElasticsearchDocumentStore(
+    host=DB_HOST,
+    port=DB_PORT,
+    username=DB_USER,
+    password=DB_PW,
+    index=DB_INDEX,
+    scheme=ES_CONN_SCHEME,
+    ca_certs=False,
+    verify_certs=False,
+    text_field=TEXT_FIELD_NAME,
+    search_fields=SEARCH_FIELD_NAME,
+    embedding_dim=EMBEDDING_DIM,
+    embedding_field=EMBEDDING_FIELD_NAME,
+    excluded_meta_data=EXCLUDE_META_DATA_FIELDS,  # type: ignore
+    faq_question_field=FAQ_QUESTION_FIELD_NAME,
+)
+
+
+@router.post("/file-upload")
+def upload_file_to_document_store(
+    file: UploadFile = File(...),
+    remove_numeric_tables: Optional[bool] = Form(REMOVE_NUMERIC_TABLES),
+    remove_whitespace: Optional[bool] = Form(REMOVE_WHITESPACE),
+    remove_empty_lines: Optional[bool] = Form(REMOVE_EMPTY_LINES),
+    remove_header_footer: Optional[bool] = Form(REMOVE_HEADER_FOOTER),
+    valid_languages: Optional[List[str]] = Form(VALID_LANGUAGES),
+) -> None:
+    try:
+        file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{file.filename}"
+        with file_path.open("wb") as buffer:
+            shutil.copyfileobj(file.file, buffer)
+
+        if file.filename.split(".")[-1].lower() == "pdf":
+            pdf_converter = PDFToTextConverter(
+                remove_numeric_tables=remove_numeric_tables,
+                remove_whitespace=remove_whitespace,
+                remove_empty_lines=remove_empty_lines,
+                remove_header_footer=remove_header_footer,
+                valid_languages=valid_languages,
+            )
+            pages = pdf_converter.extract_pages(file_path)
+        elif file.filename.split(".")[-1].lower() == "txt":
+            txt_converter = TextConverter(
+                remove_numeric_tables=remove_numeric_tables,
+                remove_whitespace=remove_whitespace,
+                remove_empty_lines=remove_empty_lines,
+                remove_header_footer=remove_header_footer,
+                valid_languages=valid_languages,
+            )
+            pages = txt_converter.extract_pages(file_path)
+        else:
+            raise HTTPException(status_code=415, detail="Only .pdf and .txt file formats are supported.")
+
+        document = {TEXT_FIELD_NAME: "\n".join(pages), "name": file.filename}
+        document_store.write_documents([document])
+
+    finally:
+        file.file.close()
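A hedged sketch of how a client might call the new endpoint with `requests`, assuming the API is served at localhost:8000 and a local `sample.txt` exists (both are illustrative); the form fields are optional and fall back to the env-var defaults above:

```python
import requests

# Hypothetical host/port; adjust to wherever the haystack API is running.
url = "http://localhost:8000/file-upload"

with open("sample.txt", "rb") as f:
    response = requests.post(
        url,
        files={"file": ("sample.txt", f, "text/plain")},
        data={"remove_numeric_tables": "true"},  # overrides the REMOVE_NUMERIC_TABLES default
    )
response.raise_for_status()  # a 415 is returned for anything other than .pdf/.txt
```

Note that the handler opens a file inside `FILE_UPLOAD_PATH` without creating the directory, so that directory must exist before the first upload.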
diff --git a/haystack/api/controller/router.py b/haystack/api/controller/router.py
index 157cff570..725bd5238 100644
--- a/haystack/api/controller/router.py
+++ b/haystack/api/controller/router.py
@@ -1,8 +1,9 @@
 from fastapi import APIRouter
 
-from haystack.api.controller import search, feedback
+from haystack.api.controller import search, feedback, file_upload
 
 router = APIRouter()
 
 router.include_router(search.router, tags=["search"])
 router.include_router(feedback.router, tags=["feedback"])
+router.include_router(file_upload.router, tags=["file-upload"])
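For context, a minimal sketch of how the aggregated router is typically mounted on the FastAPI app (the actual application setup lives elsewhere in the repo and may differ):

```python
from fastapi import FastAPI

from haystack.api.controller.router import router

app = FastAPI(title="Haystack API")  # illustrative app factory
app.include_router(router)  # /file-upload is now exposed next to the search and feedback routes
```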
diff --git a/haystack/indexing/file_converters/base.py b/haystack/indexing/file_converters/base.py
index 89d8e6d16..9949bdb3a 100644
--- a/haystack/indexing/file_converters/base.py
+++ b/haystack/indexing/file_converters/base.py
@@ -1,6 +1,10 @@
 from abc import abstractmethod
+from functools import partial, reduce
+from itertools import chain
 from pathlib import Path
-from typing import List, Optional
+from typing import List, Optional, Tuple, Generator, Set
+
+import langdetect
 
 
 class BaseConverter:
@@ -43,3 +47,97 @@ class BaseConverter:
     @abstractmethod
     def extract_pages(self, file_path: Path) -> List[str]:
         pass
+
+    def validate_language(self, text: str) -> bool:
+        """
+        Validate if the language of the text is one of the valid languages.
+        """
+        if not self.valid_languages:
+            return True
+
+        try:
+            lang = langdetect.detect(text)
+        except langdetect.lang_detect_exception.LangDetectException:
+            lang = None
+
+        if lang in self.valid_languages:
+            return True
+        else:
+            return False
+
+    def find_and_remove_header_footer(
+        self, pages: List[str], n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
+    ) -> Tuple[List[str], Optional[str], Optional[str]]:
+        """
+        Heuristic to find footers and headers across different pages by searching for the longest common string.
+        For headers, we only search within the first n_chars characters (for footers: the last n_chars).
+        Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
+        but won't detect "Page 3 of 4" or similar.
+
+        :param pages: list of strings, one string per page
+        :param n_chars: number of first/last characters in which to search for the header/footer
+        :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain a footer/header)
+        :param n_last_pages_to_ignore: number of last pages to ignore
+        :return: (cleaned pages, found_header_str, found_footer_str)
+        """
+
+        # header
+        start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
+        found_header = self._find_longest_common_ngram(start_of_pages)
+        if found_header:
+            pages = [page.replace(found_header, "") for page in pages]
+
+        # footer
+        end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
+        found_footer = self._find_longest_common_ngram(end_of_pages)
+        if found_footer:
+            pages = [page.replace(found_footer, "") for page in pages]
+        return pages, found_header, found_footer
+
+    def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
+        """
+        Return ngrams of tokens (currently split on whitespace).
+        :param seq: str, the string from which the ngrams shall be created
+        :param n: int, the n of the ngrams
+        :return: generator yielding each ngram as a string
+        """
+
+        # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
+        # we add a space before them here and remove it again after the ngrams have been created (see below)
+        seq = seq.replace("\n", " \n")
+        seq = seq.replace("\t", " \t")
+
+        words = seq.split(" ")
+        ngrams = (
+            " ".join(words[i: i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
+        )
+
+        return ngrams
+
+    def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
+        lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
+        ngrams = map(partial(self._ngram, seq), lengths)
+        res = set(chain.from_iterable(ngrams))
+        return res
+
+    def _find_longest_common_ngram(self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3) -> Optional[str]:
+        """
+        Find the longest common ngram across different text sequences (e.g. start of pages).
+        All ngrams within the specified length range are considered. Helpful for finding footers, headers etc.
+
+        :param sequences: list[str], list of strings that shall be searched for common n_grams
+        :param max_ngram: int, maximum length of ngram to consider
+        :param min_ngram: minimum length of ngram to consider
+        :return: the longest common ngram, or None if no common ngram was found
+        """
+
+        seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
+        intersection = reduce(set.intersection, seqs_ngrams)
+
+        try:
+            longest = max(intersection, key=len)
+        except ValueError:
+            # no common sequence found
+            longest = ""
+        return longest if longest.strip() else None
+
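The heuristic builds every whitespace-token ngram (between min_ngram and max_ngram tokens) from the first/last n_chars of each page, intersects those sets across pages, and strips the longest surviving ngram. A small sketch with made-up pages, using the concrete `TextConverter` added below (any subclass would do):

```python
from haystack.indexing.file_converters.txt import TextConverter

converter = TextConverter()
bodies = ["revenue grew strongly", "costs were stable", "outlook remains good",
          "risks are limited", "summary and appendix"]
pages = [f"ACME Corp - internal use only\n{body}" for body in bodies]

cleaned, header, footer = converter.find_and_remove_header_footer(
    pages, n_chars=50, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
)
assert header == "ACME Corp - internal use only"  # longest ngram common to all page starts
assert footer is None  # the page endings share no ngram of >= 3 tokens
```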
diff --git a/haystack/indexing/file_converters/pdftotext.py b/haystack/indexing/file_converters/pdf.py
similarity index 59%
rename from haystack/indexing/file_converters/pdftotext.py
rename to haystack/indexing/file_converters/pdf.py
index 82cd55a0e..493f28ac6 100644
--- a/haystack/indexing/file_converters/pdftotext.py
+++ b/haystack/indexing/file_converters/pdf.py
@@ -1,12 +1,8 @@
 import logging
 import re
 import subprocess
-from functools import partial, reduce
-from itertools import chain
 from pathlib import Path
-from typing import List, Optional, Tuple, Generator, Set
-
-import langdetect
+from typing import List, Optional
 
 from haystack.indexing.file_converters.base import BaseConverter
 
@@ -106,7 +102,7 @@ class PDFToTextConverter(BaseConverter):
 
         if self.valid_languages:
             document_text = "".join(cleaned_pages)
-            if not self._validate_language(document_text):
+            if not self.validate_language(document_text):
                 logger.warning(
                     f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
                     f"been decoded in the correct text format."
@@ -138,95 +134,3 @@ class PDFToTextConverter(BaseConverter):
             pages = pages[:-1]  # the last page in the split is always empty.
 
         return pages
-
-    def _validate_language(self, text: str) -> bool:
-        """
-        Validate if the language of the text is one of valid languages.
-        """
-        if not self.valid_languages:
-            return True
-
-        try:
-            lang = langdetect.detect(text)
-        except langdetect.lang_detect_exception.LangDetectException:
-            lang = None
-
-        if lang in self.valid_languages:
-            return True
-        else:
-            return False
-
-    def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
-        """
-        Return ngram (of tokens - currently splitted by whitespace)
-        :param seq: str, string from which the ngram shall be created
-        :param n: int, n of ngram
-        :return: str, ngram as string
-        """
-
-        # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
-        # we add a space here and remove it after creation of the ngrams again (see below)
-        seq = seq.replace("\n", " \n")
-        seq = seq.replace("\t", " \t")
-
-        words = seq.split(" ")
-        ngrams = (
-            " ".join(words[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
-        )
-
-        return ngrams
-
-    def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
-        lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
-        ngrams = map(partial(self._ngram, seq), lengths)
-        res = set(chain.from_iterable(ngrams))
-        return res
-
-    def find_longest_common_ngram(self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3) -> Optional[str]:
-        """
-        Find the longest common ngram across different text sequences (e.g. start of pages).
-        Considering all ngrams between the specified range. Helpful for finding footers, headers etc.
-
-        :param sequences: list[str], list of strings that shall be searched for common n_grams
-        :param max_ngram: int, maximum length of ngram to consider
-        :param min_ngram: minimum length of ngram to consider
-        :return: str, common string of all sections
-        """
-
-        seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
-        intersection = reduce(set.intersection, seqs_ngrams)
-
-        try:
-            longest = max(intersection, key=len)
-        except ValueError:
-            # no common sequence found
-            longest = ""
-        return longest if longest.strip() else None
-
-    def find_and_remove_header_footer(
-        self, pages: List[str], n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
-    ) -> Tuple[List[str], Optional[str], Optional[str]]:
-        """
-        Heuristic to find footers and headers across different pages by searching for the longest common string.
-        For headers we only search in the first n_chars characters (for footer: last n_chars).
-        Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
-        but won't detect "Page 3 of 4" or similar.
-
-        :param pages: list of strings, one string per page
-        :param n_chars: number of first/last characters where the header/footer shall be searched in
-        :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
-        :param n_last_pages_to_ignore: number of last pages to ignore
-        :return: (cleaned pages, found_header_str, found_footer_str)
-        """
-
-        # header
-        start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
-        found_header = self.find_longest_common_ngram(start_of_pages)
-        if found_header:
-            pages = [page.replace(found_header, "") for page in pages]
-
-        # footer
-        end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
-        found_footer = self.find_longest_common_ngram(end_of_pages)
-        if found_footer:
-            pages = [page.replace(found_footer, "") for page in pages]
-        return pages, found_header, found_footer
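Callers only see a module rename here; the converter's behavior is unchanged. A hedged usage sketch with the new import path (assumes the `pdftotext` utility is installed, as before, and a hypothetical local PDF):

```python
from pathlib import Path

from haystack.indexing.file_converters.pdf import PDFToTextConverter  # was ...file_converters.pdftotext

converter = PDFToTextConverter(remove_header_footer=True, valid_languages=["en"])
pages = converter.extract_pages(Path("annual_report.pdf"))  # hypothetical file
```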
diff --git a/haystack/indexing/file_converters/txt.py b/haystack/indexing/file_converters/txt.py
new file mode 100644
index 000000000..690d11624
--- /dev/null
+++ b/haystack/indexing/file_converters/txt.py
@@ -0,0 +1,93 @@
+import logging
+import re
+from pathlib import Path
+from typing import List, Optional
+
+from haystack.indexing.file_converters.base import BaseConverter
+
+logger = logging.getLogger(__name__)
+
+
+class TextConverter(BaseConverter):
+    def __init__(
+        self,
+        remove_numeric_tables: Optional[bool] = False,
+        remove_whitespace: Optional[bool] = None,
+        remove_empty_lines: Optional[bool] = None,
+        remove_header_footer: Optional[bool] = None,
+        valid_languages: Optional[List[str]] = None,
+    ):
+        """
+        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from tables.
+                                      Tabular structures in documents might be noise for the reader model if it
+                                      does not have table parsing capability for finding answers. However, tables
+                                      may also contain long strings that could be possible candidates for answers.
+                                      Rows containing strings are therefore retained with this option.
+        :param remove_whitespace: strip whitespace at the beginning and end of each line in the text.
+        :param remove_empty_lines: collapse runs of consecutive empty lines in the text down to a single empty line.
+        :param remove_header_footer: use a heuristic to remove footers and headers across different pages by searching
+                                     for the longest common string. This heuristic uses exact matches and therefore
+                                     works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
+                                     or similar.
+        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
+                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
+                                This option can be used to check for encoding errors. If the extracted text is
+                                not in one of the valid languages, it is likely the result of an encoding error
+                                that produced garbled text.
+        """
+
+        super().__init__(
+            remove_numeric_tables=remove_numeric_tables,
+            remove_whitespace=remove_whitespace,
+            remove_empty_lines=remove_empty_lines,
+            remove_header_footer=remove_header_footer,
+            valid_languages=valid_languages,
+        )
+
+    def extract_pages(self, file_path: Path) -> List[str]:
+        with open(file_path) as f:
+            text = f.read()
+        pages = text.split("\f")
+
+        cleaned_pages = []
+        for page in pages:
+            lines = page.splitlines()
+            cleaned_lines = []
+            for line in lines:
+                words = line.split()
+                digits = [word for word in words if any(i.isdigit() for i in word)]
+
+                # remove lines having > 40% of words as digits AND not ending with a period (.)
+                if self.remove_numeric_tables:
+                    if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
+                        logger.debug(f"Removing line '{line}' from {file_path}")
+                        continue
+
+                if self.remove_whitespace:
+                    line = line.strip()
+
+                cleaned_lines.append(line)
+
+            page = "\n".join(cleaned_lines)
+
+            if self.remove_empty_lines:
+                page = re.sub(r"\n\n+", "\n\n", page)
+
+            cleaned_pages.append(page)
+
+        if self.valid_languages:
+            document_text = "".join(cleaned_pages)
+            if not self.validate_language(document_text):
+                logger.warning(
+                    f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
+                    f"been decoded in the correct text format."
+                )
+
+        if self.remove_header_footer:
+            cleaned_pages, header, footer = self.find_and_remove_header_footer(
+                cleaned_pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
+            )
+            logger.info(f"Removed header '{header}' and footer '{footer}' in {file_path}")
+
+        return cleaned_pages
+
diff --git a/haystack/indexing/utils.py b/haystack/indexing/utils.py
index 517d6b159..9f8dd1edf 100644
--- a/haystack/indexing/utils.py
+++ b/haystack/indexing/utils.py
@@ -7,7 +7,7 @@ from typing import Callable, List, Optional
 
 from farm.data_handler.utils import http_get
 
-from haystack.indexing.file_converters.pdftotext import PDFToTextConverter
+from haystack.indexing.file_converters.pdf import PDFToTextConverter
 
 logger = logging.getLogger(__name__)
 
diff --git a/test/test_pdf_conversion.py b/test/test_pdf_conversion.py
index 130caa5f8..f95ccbe2c 100644
--- a/test/test_pdf_conversion.py
+++ b/test/test_pdf_conversion.py
@@ -1,7 +1,7 @@
 import logging
 from pathlib import Path
 
-from haystack.indexing.file_converters.pdftotext import PDFToTextConverter
+from haystack.indexing.file_converters.pdf import PDFToTextConverter
 
 logger = logging.getLogger(__name__)
 
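End to end, the new `TextConverter` mirrors the PDF path for plain text: pages are split on form feeds ("\f"), cleaned line by line, optionally language-checked, and header/footer-stripped. A short sketch with a hypothetical file:

```python
from pathlib import Path

from haystack.indexing.file_converters.txt import TextConverter

converter = TextConverter(remove_numeric_tables=True, remove_empty_lines=True)
pages = converter.extract_pages(Path("notes.txt"))  # hypothetical file; pages are split on "\f"
print(f"Extracted {len(pages)} page(s)")
```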