mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-29 02:46:39 +00:00
Add API endpoint to upload files (#154)
This commit is contained in:
parent
8bcc4b26a1
commit
a349eef0db
@ -41,6 +41,16 @@ if EXCLUDE_META_DATA_FIELDS:
|
|||||||
# Embedding model configuration: filesystem path and framework format ("farm" or "sentence_transformers"-style).
EMBEDDING_MODEL_PATH = os.getenv("EMBEDDING_MODEL_PATH", None)
EMBEDDING_MODEL_FORMAT = os.getenv("EMBEDDING_MODEL_FORMAT", "farm")

# File uploads
# Directory where files posted to the upload endpoint are persisted.
FILE_UPLOAD_PATH = os.getenv("FILE_UPLOAD_PATH", "file-uploads")

# Text-cleaning flags for the file converters. The env vars hold the strings
# "True"/"False"; any casing is accepted, everything else counts as False.
REMOVE_NUMERIC_TABLES = os.getenv("REMOVE_NUMERIC_TABLES", "True").lower() == "true"
REMOVE_WHITESPACE = os.getenv("REMOVE_WHITESPACE", "True").lower() == "true"
REMOVE_EMPTY_LINES = os.getenv("REMOVE_EMPTY_LINES", "True").lower() == "true"
REMOVE_HEADER_FOOTER = os.getenv("REMOVE_HEADER_FOOTER", "True").lower() == "true"

# Optional whitelist of ISO 639-1 language codes, given as a Python literal
# list in the env var, e.g. VALID_LANGUAGES="['en', 'de']".
VALID_LANGUAGES = os.getenv("VALID_LANGUAGES", None)
if VALID_LANGUAGES:
    # literal_eval parses the list safely (no arbitrary code execution).
    VALID_LANGUAGES = ast.literal_eval(VALID_LANGUAGES)

# Monitoring
APM_SERVER = os.getenv("APM_SERVER", None)
APM_SERVICE_NAME = os.getenv("APM_SERVICE_NAME", "haystack-backend")
|
80
haystack/api/controller/file_upload.py
Normal file
80
haystack/api/controller/file_upload.py
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, List
|
||||||
|
|
||||||
|
from fastapi import APIRouter
|
||||||
|
from fastapi import HTTPException
|
||||||
|
from fastapi import UploadFile, File, Form
|
||||||
|
|
||||||
|
from haystack.api.config import DB_HOST, DB_PORT, DB_USER, DB_PW, DB_INDEX, ES_CONN_SCHEME, TEXT_FIELD_NAME, \
|
||||||
|
SEARCH_FIELD_NAME, FILE_UPLOAD_PATH, EMBEDDING_DIM, EMBEDDING_FIELD_NAME, EXCLUDE_META_DATA_FIELDS, VALID_LANGUAGES, \
|
||||||
|
FAQ_QUESTION_FIELD_NAME, REMOVE_NUMERIC_TABLES, REMOVE_WHITESPACE, REMOVE_EMPTY_LINES, REMOVE_HEADER_FOOTER
|
||||||
|
from haystack.database.elasticsearch import ElasticsearchDocumentStore
|
||||||
|
from haystack.indexing.file_converters.pdf import PDFToTextConverter
|
||||||
|
from haystack.indexing.file_converters.txt import TextConverter
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)

router = APIRouter()


# All connection settings come from the environment-driven config module; a
# single module-level store instance is shared by every request this
# controller handles.
_store_params = dict(
    host=DB_HOST,
    port=DB_PORT,
    username=DB_USER,
    password=DB_PW,
    index=DB_INDEX,
    scheme=ES_CONN_SCHEME,
    ca_certs=False,
    verify_certs=False,
    text_field=TEXT_FIELD_NAME,
    search_fields=SEARCH_FIELD_NAME,
    embedding_dim=EMBEDDING_DIM,
    embedding_field=EMBEDDING_FIELD_NAME,
    excluded_meta_data=EXCLUDE_META_DATA_FIELDS,  # type: ignore
    faq_question_field=FAQ_QUESTION_FIELD_NAME,
)
document_store = ElasticsearchDocumentStore(**_store_params)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/file-upload")
def upload_file_to_document_store(
    file: UploadFile = File(...),
    remove_numeric_tables: Optional[bool] = Form(REMOVE_NUMERIC_TABLES),
    remove_whitespace: Optional[bool] = Form(REMOVE_WHITESPACE),
    remove_empty_lines: Optional[bool] = Form(REMOVE_EMPTY_LINES),
    remove_header_footer: Optional[bool] = Form(REMOVE_HEADER_FOOTER),
    valid_languages: Optional[List[str]] = Form(VALID_LANGUAGES),
) -> None:
    """
    Persist an uploaded .pdf or .txt file, convert it to text and index the
    result in the document store.

    :param file: the uploaded file (multipart form field)
    :param remove_numeric_tables: heuristic removal of numeric table rows (see converters)
    :param remove_whitespace: strip whitespace around each line
    :param remove_empty_lines: collapse runs of empty lines
    :param remove_header_footer: strip repeated headers/footers across pages
    :param valid_languages: ISO 639-1 codes used to sanity-check the decoded text
    :raises HTTPException: 415 for any extension other than .pdf / .txt
    """
    try:
        # Make sure the upload directory exists before writing into it;
        # otherwise the very first upload on a fresh deployment fails.
        upload_dir = Path(FILE_UPLOAD_PATH)
        upload_dir.mkdir(parents=True, exist_ok=True)

        # Prefix with a random hex id so identical filenames never collide.
        file_path = upload_dir / f"{uuid.uuid4().hex}_{file.filename}"
        with file_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        extension = file.filename.split(".")[-1].lower()  # computed once, reused below
        if extension == "pdf":
            converter = PDFToTextConverter(
                remove_numeric_tables=remove_numeric_tables,
                remove_whitespace=remove_whitespace,
                remove_empty_lines=remove_empty_lines,
                remove_header_footer=remove_header_footer,
                valid_languages=valid_languages,
            )
        elif extension == "txt":
            converter = TextConverter(
                remove_numeric_tables=remove_numeric_tables,
                remove_whitespace=remove_whitespace,
                remove_empty_lines=remove_empty_lines,
                remove_header_footer=remove_header_footer,
                valid_languages=valid_languages,
            )
        else:
            raise HTTPException(status_code=415, detail="Only .pdf and .txt file formats are supported.")

        pages = converter.extract_pages(file_path)
        document = {TEXT_FIELD_NAME: "\n".join(pages), "name": file.filename}
        document_store.write_documents([document])

    finally:
        # Always release the temporary spooled file handle FastAPI gives us.
        file.file.close()
|
@ -1,8 +1,9 @@
|
|||||||
from fastapi import APIRouter

from haystack.api.controller import search, feedback, file_upload

# Top-level API router aggregating every controller sub-router.
router = APIRouter()

# Mount each controller under its own OpenAPI tag.
for _controller, _tag in ((search, "search"), (feedback, "feedback"), (file_upload, "file-upload")):
    router.include_router(_controller.router, tags=[_tag])
||||||
|
@ -1,6 +1,10 @@
|
|||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
|
from functools import partial, reduce
|
||||||
|
from itertools import chain
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional
|
from typing import List, Optional, Tuple, Generator, Set
|
||||||
|
|
||||||
|
import langdetect
|
||||||
|
|
||||||
|
|
||||||
class BaseConverter:
|
class BaseConverter:
|
||||||
@ -43,3 +47,97 @@ class BaseConverter:
|
|||||||
@abstractmethod
def extract_pages(self, file_path: Path) -> List[str]:
    """
    Convert the file at *file_path* into a list of strings, one per page.

    Must be implemented by each concrete converter (PDF, plain text, ...).
    """
    pass
|
||||||
|
|
||||||
|
def validate_language(self, text: str) -> bool:
    """
    Return True if *text* is written in one of ``self.valid_languages``.

    When no valid languages are configured, every text passes.
    """
    if not self.valid_languages:
        return True

    try:
        detected = langdetect.detect(text)
    except langdetect.lang_detect_exception.LangDetectException:
        # Detection can fail on very short or non-linguistic text.
        detected = None

    return detected in self.valid_languages
||||||
|
|
||||||
|
def find_and_remove_header_footer(
    self, pages: List[str], n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
) -> Tuple[List[str], Optional[str], Optional[str]]:
    """
    Heuristic to find footers and headers across different pages by searching for the longest common string.
    For headers we only search in the first n_chars characters (for footer: last n_chars).
    Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
    but won't detect "Page 3 of 4" or similar.

    :param pages: list of strings, one string per page
    :param n_chars: number of first/last characters where the header/footer shall be searched in
    :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
    :param n_last_pages_to_ignore: number of last pages to ignore
    :return: (cleaned pages, found_header_str, found_footer_str)
    """
    # BUG FIX: with n_last_pages_to_ignore == 0 the old slice ``[a:-0]``
    # collapsed to ``[a:0]`` (empty). Map "ignore zero last pages" to None.
    last = -n_last_pages_to_ignore if n_last_pages_to_ignore else None

    # header
    start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:last]]
    found_header = self._find_longest_common_ngram(start_of_pages)
    if found_header:
        pages = [page.replace(found_header, "") for page in pages]

    # footer — intentionally searched on the header-stripped pages.
    end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:last]]
    found_footer = self._find_longest_common_ngram(end_of_pages)
    if found_footer:
        pages = [page.replace(found_footer, "") for page in pages]
    return pages, found_header, found_footer
||||||
|
|
||||||
|
def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
|
||||||
|
"""
|
||||||
|
Return ngram (of tokens - currently splitted by whitespace)
|
||||||
|
:param seq: str, string from which the ngram shall be created
|
||||||
|
:param n: int, n of ngram
|
||||||
|
:return: str, ngram as string
|
||||||
|
"""
|
||||||
|
|
||||||
|
# In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
|
||||||
|
# we add a space here and remove it after creation of the ngrams again (see below)
|
||||||
|
seq = seq.replace("\n", " \n")
|
||||||
|
seq = seq.replace("\t", " \t")
|
||||||
|
|
||||||
|
words = seq.split(" ")
|
||||||
|
ngrams = (
|
||||||
|
" ".join(words[i: i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
|
||||||
|
)
|
||||||
|
|
||||||
|
return ngrams
|
||||||
|
|
||||||
|
def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
    """
    Return the set of all token ngrams of *seq* with sizes in
    ``[min_ngram, max_ngram)``; when *max_ngram* is falsy, sizes run up to
    ``len(seq)`` instead.
    """
    upper = max_ngram if max_ngram else len(seq)
    all_grams: Set[str] = set()
    for size in range(min_ngram, upper):
        all_grams.update(self._ngram(seq, size))
    return all_grams
|
||||||
|
|
||||||
|
def _find_longest_common_ngram(self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3) -> Optional[str]:
    """
    Find the longest common ngram across different text sequences (e.g. start of pages).
    Considering all ngrams between the specified range. Helpful for finding footers, headers etc.

    :param sequences: list of strings that shall be searched for common n_grams
    :param max_ngram: maximum length of ngram to consider
    :param min_ngram: minimum length of ngram to consider
    :return: common string of all sections, or None if no (non-blank) one exists
    """
    per_sequence = (self._allngram(seq, min_ngram, max_ngram) for seq in sequences)
    common = reduce(set.intersection, per_sequence)

    # max() with default avoids the ValueError the empty intersection would raise.
    longest = max(common, key=len, default="")
    return longest if longest.strip() else None
|
||||||
|
|
||||||
|
@ -1,12 +1,8 @@
|
|||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
from functools import partial, reduce
|
|
||||||
from itertools import chain
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional, Tuple, Generator, Set
|
from typing import List, Optional
|
||||||
|
|
||||||
import langdetect
|
|
||||||
|
|
||||||
from haystack.indexing.file_converters.base import BaseConverter
|
from haystack.indexing.file_converters.base import BaseConverter
|
||||||
|
|
||||||
@ -106,7 +102,7 @@ class PDFToTextConverter(BaseConverter):
|
|||||||
|
|
||||||
if self.valid_languages:
|
if self.valid_languages:
|
||||||
document_text = "".join(cleaned_pages)
|
document_text = "".join(cleaned_pages)
|
||||||
if not self._validate_language(document_text):
|
if not self.validate_language(document_text):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
|
f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
|
||||||
f"been decoded in the correct text format."
|
f"been decoded in the correct text format."
|
||||||
@ -138,95 +134,3 @@ class PDFToTextConverter(BaseConverter):
|
|||||||
pages = pages[:-1] # the last page in the split is always empty.
|
pages = pages[:-1] # the last page in the split is always empty.
|
||||||
return pages
|
return pages
|
||||||
|
|
||||||
def _validate_language(self, text: str) -> bool:
|
|
||||||
"""
|
|
||||||
Validate if the language of the text is one of valid languages.
|
|
||||||
"""
|
|
||||||
if not self.valid_languages:
|
|
||||||
return True
|
|
||||||
|
|
||||||
try:
|
|
||||||
lang = langdetect.detect(text)
|
|
||||||
except langdetect.lang_detect_exception.LangDetectException:
|
|
||||||
lang = None
|
|
||||||
|
|
||||||
if lang in self.valid_languages:
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
|
|
||||||
"""
|
|
||||||
Return ngram (of tokens - currently splitted by whitespace)
|
|
||||||
:param seq: str, string from which the ngram shall be created
|
|
||||||
:param n: int, n of ngram
|
|
||||||
:return: str, ngram as string
|
|
||||||
"""
|
|
||||||
|
|
||||||
# In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
|
|
||||||
# we add a space here and remove it after creation of the ngrams again (see below)
|
|
||||||
seq = seq.replace("\n", " \n")
|
|
||||||
seq = seq.replace("\t", " \t")
|
|
||||||
|
|
||||||
words = seq.split(" ")
|
|
||||||
ngrams = (
|
|
||||||
" ".join(words[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
|
|
||||||
)
|
|
||||||
|
|
||||||
return ngrams
|
|
||||||
|
|
||||||
def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
|
|
||||||
lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
|
|
||||||
ngrams = map(partial(self._ngram, seq), lengths)
|
|
||||||
res = set(chain.from_iterable(ngrams))
|
|
||||||
return res
|
|
||||||
|
|
||||||
def find_longest_common_ngram(self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3) -> Optional[str]:
|
|
||||||
"""
|
|
||||||
Find the longest common ngram across different text sequences (e.g. start of pages).
|
|
||||||
Considering all ngrams between the specified range. Helpful for finding footers, headers etc.
|
|
||||||
|
|
||||||
:param sequences: list[str], list of strings that shall be searched for common n_grams
|
|
||||||
:param max_ngram: int, maximum length of ngram to consider
|
|
||||||
:param min_ngram: minimum length of ngram to consider
|
|
||||||
:return: str, common string of all sections
|
|
||||||
"""
|
|
||||||
|
|
||||||
seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
|
|
||||||
intersection = reduce(set.intersection, seqs_ngrams)
|
|
||||||
|
|
||||||
try:
|
|
||||||
longest = max(intersection, key=len)
|
|
||||||
except ValueError:
|
|
||||||
# no common sequence found
|
|
||||||
longest = ""
|
|
||||||
return longest if longest.strip() else None
|
|
||||||
|
|
||||||
def find_and_remove_header_footer(
|
|
||||||
self, pages: List[str], n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
|
|
||||||
) -> Tuple[List[str], Optional[str], Optional[str]]:
|
|
||||||
"""
|
|
||||||
Heuristic to find footers and headers across different pages by searching for the longest common string.
|
|
||||||
For headers we only search in the first n_chars characters (for footer: last n_chars).
|
|
||||||
Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
|
|
||||||
but won't detect "Page 3 of 4" or similar.
|
|
||||||
|
|
||||||
:param pages: list of strings, one string per page
|
|
||||||
:param n_chars: number of first/last characters where the header/footer shall be searched in
|
|
||||||
:param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
|
|
||||||
:param n_last_pages_to_ignore: number of last pages to ignore
|
|
||||||
:return: (cleaned pages, found_header_str, found_footer_str)
|
|
||||||
"""
|
|
||||||
|
|
||||||
# header
|
|
||||||
start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
|
|
||||||
found_header = self.find_longest_common_ngram(start_of_pages)
|
|
||||||
if found_header:
|
|
||||||
pages = [page.replace(found_header, "") for page in pages]
|
|
||||||
|
|
||||||
# footer
|
|
||||||
end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
|
|
||||||
found_footer = self.find_longest_common_ngram(end_of_pages)
|
|
||||||
if found_footer:
|
|
||||||
pages = [page.replace(found_footer, "") for page in pages]
|
|
||||||
return pages, found_header, found_footer
|
|
93
haystack/indexing/file_converters/txt.py
Normal file
93
haystack/indexing/file_converters/txt.py
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from haystack.indexing.file_converters.base import BaseConverter
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class TextConverter(BaseConverter):
    """Converter that turns plain-text files into cleaned, per-page strings."""

    def __init__(
        self,
        remove_numeric_tables: Optional[bool] = False,
        remove_whitespace: Optional[bool] = None,
        remove_empty_lines: Optional[bool] = None,
        remove_header_footer: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
    ):
        """
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could possible candidate for searching answers.
                                      The rows containing strings are thus retained in this option.
        :param remove_whitespace: strip whitespaces before or after each line in the text.
        :param remove_empty_lines: remove more than two empty lines in the text.
        :param remove_header_footer: use heuristic to remove footers and headers across different pages by searching
                                     for the longest common string. This heuristic uses exact matches and therefore
                                     works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
                                     or similar.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add test for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be encoding error resulting
                                in garbled text.
        """
        super().__init__(
            remove_numeric_tables=remove_numeric_tables,
            remove_whitespace=remove_whitespace,
            remove_empty_lines=remove_empty_lines,
            remove_header_footer=remove_header_footer,
            valid_languages=valid_languages,
        )

    def extract_pages(self, file_path: Path) -> List[str]:
        """
        Read a plain-text file and return the cleaned pages (pages are
        delimited by form-feed characters, "\\f").
        """
        # NOTE(review): opens with the platform default encoding — presumably
        # the uploads are UTF-8; confirm before hardening.
        with open(file_path) as f:
            text = f.read()
        pages = text.split("\f")

        cleaned_pages = []
        for page in pages:
            cleaned_lines = []
            for line in page.splitlines():
                if self.remove_numeric_tables:
                    # Only pay for the digit scan when the option is enabled.
                    words = line.split()
                    digits = [word for word in words if any(i.isdigit() for i in word)]
                    # remove lines having > 40% of words as digits AND not ending with a period(.)
                    if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
                        logger.debug(f"Removing line '{line}' from {file_path}")
                        continue

                if self.remove_whitespace:
                    line = line.strip()

                cleaned_lines.append(line)

            page = "\n".join(cleaned_lines)

            if self.remove_empty_lines:
                # Collapse runs of blank lines down to a single blank line.
                page = re.sub(r"\n\n+", "\n\n", page)

            cleaned_pages.append(page)

        if self.valid_languages:
            document_text = "".join(cleaned_pages)
            if not self.validate_language(document_text):
                logger.warning(
                    f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
                    f"been decoded in the correct text format."
                )

        if self.remove_header_footer:
            cleaned_pages, header, footer = self.find_and_remove_header_footer(
                cleaned_pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
            )
            logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")

        return cleaned_pages
|
|
@ -7,7 +7,7 @@ from typing import Callable, List, Optional
|
|||||||
|
|
||||||
from farm.data_handler.utils import http_get
|
from farm.data_handler.utils import http_get
|
||||||
|
|
||||||
from haystack.indexing.file_converters.pdftotext import PDFToTextConverter
|
from haystack.indexing.file_converters.pdf import PDFToTextConverter
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from haystack.indexing.file_converters.pdftotext import PDFToTextConverter
|
from haystack.indexing.file_converters.pdf import PDFToTextConverter
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user