Add API endpoint to upload files (#154)

parent 8bcc4b26a1
commit a349eef0db
haystack/api/config.py

@@ -41,6 +41,16 @@ if EXCLUDE_META_DATA_FIELDS:

 EMBEDDING_MODEL_PATH = os.getenv("EMBEDDING_MODEL_PATH", None)
 EMBEDDING_MODEL_FORMAT = os.getenv("EMBEDDING_MODEL_FORMAT", "farm")
+
+# File uploads
+FILE_UPLOAD_PATH = os.getenv("FILE_UPLOAD_PATH", "file-uploads")
+REMOVE_NUMERIC_TABLES = os.getenv("REMOVE_NUMERIC_TABLES", "True").lower() == "true"
+REMOVE_WHITESPACE = os.getenv("REMOVE_WHITESPACE", "True").lower() == "true"
+REMOVE_EMPTY_LINES = os.getenv("REMOVE_EMPTY_LINES", "True").lower() == "true"
+REMOVE_HEADER_FOOTER = os.getenv("REMOVE_HEADER_FOOTER", "True").lower() == "true"
+VALID_LANGUAGES = os.getenv("VALID_LANGUAGES", None)
+if VALID_LANGUAGES:
+    VALID_LANGUAGES = ast.literal_eval(VALID_LANGUAGES)

 # Monitoring
 APM_SERVER = os.getenv("APM_SERVER", None)
 APM_SERVICE_NAME = os.getenv("APM_SERVICE_NAME", "haystack-backend")
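Since environment variables are plain strings, VALID_LANGUAGES is parsed with ast.literal_eval into a Python list. A minimal sketch of the expected format (the value shown is illustrative, not part of the commit):

import ast
import os

os.environ["VALID_LANGUAGES"] = '["en", "de"]'  # hypothetical example value

valid_languages = os.getenv("VALID_LANGUAGES", None)
if valid_languages:
    valid_languages = ast.literal_eval(valid_languages)  # -> ["en", "de"]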
haystack/api/controller/file_upload.py (new file, 80 lines)

@@ -0,0 +1,80 @@
import logging
import shutil
import uuid
from pathlib import Path
from typing import Optional, List

from fastapi import APIRouter
from fastapi import HTTPException
from fastapi import UploadFile, File, Form

from haystack.api.config import DB_HOST, DB_PORT, DB_USER, DB_PW, DB_INDEX, ES_CONN_SCHEME, TEXT_FIELD_NAME, \
    SEARCH_FIELD_NAME, FILE_UPLOAD_PATH, EMBEDDING_DIM, EMBEDDING_FIELD_NAME, EXCLUDE_META_DATA_FIELDS, VALID_LANGUAGES, \
    FAQ_QUESTION_FIELD_NAME, REMOVE_NUMERIC_TABLES, REMOVE_WHITESPACE, REMOVE_EMPTY_LINES, REMOVE_HEADER_FOOTER
from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.indexing.file_converters.pdf import PDFToTextConverter
from haystack.indexing.file_converters.txt import TextConverter


logger = logging.getLogger(__name__)
router = APIRouter()


document_store = ElasticsearchDocumentStore(
    host=DB_HOST,
    port=DB_PORT,
    username=DB_USER,
    password=DB_PW,
    index=DB_INDEX,
    scheme=ES_CONN_SCHEME,
    ca_certs=False,
    verify_certs=False,
    text_field=TEXT_FIELD_NAME,
    search_fields=SEARCH_FIELD_NAME,
    embedding_dim=EMBEDDING_DIM,
    embedding_field=EMBEDDING_FIELD_NAME,
    excluded_meta_data=EXCLUDE_META_DATA_FIELDS,  # type: ignore
    faq_question_field=FAQ_QUESTION_FIELD_NAME,
)


@router.post("/file-upload")
def upload_file_to_document_store(
    file: UploadFile = File(...),
    remove_numeric_tables: Optional[bool] = Form(REMOVE_NUMERIC_TABLES),
    remove_whitespace: Optional[bool] = Form(REMOVE_WHITESPACE),
    remove_empty_lines: Optional[bool] = Form(REMOVE_EMPTY_LINES),
    remove_header_footer: Optional[bool] = Form(REMOVE_HEADER_FOOTER),
    valid_languages: Optional[List[str]] = Form(VALID_LANGUAGES),
) -> None:
    try:
        file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{file.filename}"
        with file_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        if file.filename.split(".")[-1].lower() == "pdf":
            pdf_converter = PDFToTextConverter(
                remove_numeric_tables=remove_numeric_tables,
                remove_whitespace=remove_whitespace,
                remove_empty_lines=remove_empty_lines,
                remove_header_footer=remove_header_footer,
                valid_languages=valid_languages,
            )
            pages = pdf_converter.extract_pages(file_path)
        elif file.filename.split(".")[-1].lower() == "txt":
            txt_converter = TextConverter(
                remove_numeric_tables=remove_numeric_tables,
                remove_whitespace=remove_whitespace,
                remove_empty_lines=remove_empty_lines,
                remove_header_footer=remove_header_footer,
                valid_languages=valid_languages,
            )
            pages = txt_converter.extract_pages(file_path)
        else:
            raise HTTPException(status_code=415, detail="Only .pdf and .txt file formats are supported.")

        document = {TEXT_FIELD_NAME: "\n".join(pages), "name": file.filename}
        document_store.write_documents([document])

    finally:
        file.file.close()
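With this controller in place, the endpoint accepts a multipart form upload. A minimal client-side sketch using requests (the host/port and file name are assumptions about a local deployment, not part of the commit):

import requests

with open("sample.pdf", "rb") as f:
    response = requests.post(
        "http://localhost:8000/file-upload",
        files={"file": ("sample.pdf", f, "application/pdf")},
        data={"remove_numeric_tables": "true"},  # form fields are sent as strings
    )
response.raise_for_status()  # a 415 is returned for unsupported file formats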
haystack/api/application.py

@@ -1,8 +1,9 @@
 from fastapi import APIRouter

-from haystack.api.controller import search, feedback
+from haystack.api.controller import search, feedback, file_upload

 router = APIRouter()

 router.include_router(search.router, tags=["search"])
 router.include_router(feedback.router, tags=["feedback"])
+router.include_router(file_upload.router, tags=["file-upload"])
haystack/indexing/file_converters/base.py

@@ -1,6 +1,10 @@
 from abc import abstractmethod
+from functools import partial, reduce
+from itertools import chain
 from pathlib import Path
-from typing import List, Optional
+from typing import List, Optional, Tuple, Generator, Set
+
+import langdetect


 class BaseConverter:
@@ -43,3 +47,97 @@ class BaseConverter:
     @abstractmethod
     def extract_pages(self, file_path: Path) -> List[str]:
         pass
+
+    def validate_language(self, text: str) -> bool:
+        """
+        Validate if the language of the text is one of the valid languages.
+        """
+        if not self.valid_languages:
+            return True
+
+        try:
+            lang = langdetect.detect(text)
+        except langdetect.lang_detect_exception.LangDetectException:
+            lang = None
+
+        if lang in self.valid_languages:
+            return True
+        else:
+            return False
+
+    def find_and_remove_header_footer(
+        self, pages: List[str], n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
+    ) -> Tuple[List[str], Optional[str], Optional[str]]:
+        """
+        Heuristic to find footers and headers across different pages by searching for the longest common string.
+        For headers we only search in the first n_chars characters (for footers: the last n_chars).
+        Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
+        but won't detect "Page 3 of 4" or similar.
+
+        :param pages: list of strings, one string per page
+        :param n_chars: number of first/last characters in which the header/footer shall be searched
+        :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
+        :param n_last_pages_to_ignore: number of last pages to ignore
+        :return: (cleaned pages, found_header_str, found_footer_str)
+        """
+
+        # header
+        start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
+        found_header = self._find_longest_common_ngram(start_of_pages)
+        if found_header:
+            pages = [page.replace(found_header, "") for page in pages]
+
+        # footer
+        end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
+        found_footer = self._find_longest_common_ngram(end_of_pages)
+        if found_footer:
+            pages = [page.replace(found_footer, "") for page in pages]
+        return pages, found_header, found_footer
+
+    def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
+        """
+        Return ngrams (of tokens - currently split by whitespace)
+        :param seq: str, string from which the ngrams shall be created
+        :param n: int, n of ngram
+        :return: str, ngram as string
+        """
+
+        # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
+        # we add a space here and remove it after creation of the ngrams again (see below)
+        seq = seq.replace("\n", " \n")
+        seq = seq.replace("\t", " \t")
+
+        words = seq.split(" ")
+        ngrams = (
+            " ".join(words[i: i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
+        )
+
+        return ngrams
+
+    def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
+        lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
+        ngrams = map(partial(self._ngram, seq), lengths)
+        res = set(chain.from_iterable(ngrams))
+        return res
+
+    def _find_longest_common_ngram(self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3) -> Optional[str]:
+        """
+        Find the longest common ngram across different text sequences (e.g. start of pages).
+        Considers all ngrams within the specified range. Helpful for finding footers, headers etc.
+
+        :param sequences: list[str], list of strings that shall be searched for common n_grams
+        :param max_ngram: int, maximum length of ngram to consider
+        :param min_ngram: minimum length of ngram to consider
+        :return: str, common string of all sections
+        """
+
+        seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
+        intersection = reduce(set.intersection, seqs_ngrams)
+
+        try:
+            longest = max(intersection, key=len)
+        except ValueError:
+            # no common sequence found
+            longest = ""
+        return longest if longest.strip() else None
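The header/footer heuristic boils down to intersecting the n-gram sets of each page's first (or last) n_chars characters and keeping the longest survivor. A self-contained toy sketch of the same idea (standalone functions mirroring _ngram/_allngram, written here for illustration only):

from functools import reduce
from itertools import chain

def ngrams(seq, n):
    # Tokenize on spaces while preserving newlines, as in BaseConverter._ngram
    words = seq.replace("\n", " \n").split(" ")
    return (" ".join(words[i:i + n]).replace(" \n", "\n") for i in range(len(words) - n + 1))

def all_ngrams(seq, min_n, max_n):
    return set(chain.from_iterable(ngrams(seq, n) for n in range(min_n, max_n)))

pages = [
    "ACME Annual Report 2019\nRevenue grew by twelve percent this year.",
    "ACME Annual Report 2019\nOperating costs fell slightly in Q4.",
]
# Intersect the n-gram sets of the first 40 characters of every page
common = reduce(set.intersection, (all_ngrams(p[:40], 3, 30) for p in pages))
print(max(common, key=len))  # -> "ACME Annual Report 2019"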
haystack/indexing/file_converters/pdf.py

@@ -1,12 +1,8 @@
 import logging
 import re
 import subprocess
-from functools import partial, reduce
-from itertools import chain
 from pathlib import Path
-from typing import List, Optional, Tuple, Generator, Set
-
-import langdetect
+from typing import List, Optional

 from haystack.indexing.file_converters.base import BaseConverter

@@ -106,7 +102,7 @@ class PDFToTextConverter(BaseConverter):

         if self.valid_languages:
             document_text = "".join(cleaned_pages)
-            if not self._validate_language(document_text):
+            if not self.validate_language(document_text):
                 logger.warning(
                     f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
                     f"been decoded in the correct text format."
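The rename from _validate_language to validate_language reflects its move to the public surface of BaseConverter. For intuition, langdetect classifies raw text into ISO 639-1 codes; a quick illustrative check (assuming langdetect is installed):

import langdetect

print(langdetect.detect("Dies ist ein deutscher Beispieltext."))  # -> "de"
# A converter configured with valid_languages=["en"] would log a warning
# for this document, flagging a possible decoding problem.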
@@ -138,95 +134,3 @@ class PDFToTextConverter(BaseConverter):
         pages = pages[:-1]  # the last page in the split is always empty.
         return pages
-
-    def _validate_language(self, text: str) -> bool:
-        """
-        Validate if the language of the text is one of valid languages.
-        """
-        if not self.valid_languages:
-            return True
-
-        try:
-            lang = langdetect.detect(text)
-        except langdetect.lang_detect_exception.LangDetectException:
-            lang = None
-
-        if lang in self.valid_languages:
-            return True
-        else:
-            return False
-
-    def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
-        """
-        Return ngram (of tokens - currently splitted by whitespace)
-        :param seq: str, string from which the ngram shall be created
-        :param n: int, n of ngram
-        :return: str, ngram as string
-        """
-
-        # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
-        # we add a space here and remove it after creation of the ngrams again (see below)
-        seq = seq.replace("\n", " \n")
-        seq = seq.replace("\t", " \t")
-
-        words = seq.split(" ")
-        ngrams = (
-            " ".join(words[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
-        )
-
-        return ngrams
-
-    def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
-        lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
-        ngrams = map(partial(self._ngram, seq), lengths)
-        res = set(chain.from_iterable(ngrams))
-        return res
-
-    def find_longest_common_ngram(self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3) -> Optional[str]:
-        """
-        Find the longest common ngram across different text sequences (e.g. start of pages).
-        Considering all ngrams between the specified range. Helpful for finding footers, headers etc.
-
-        :param sequences: list[str], list of strings that shall be searched for common n_grams
-        :param max_ngram: int, maximum length of ngram to consider
-        :param min_ngram: minimum length of ngram to consider
-        :return: str, common string of all sections
-        """
-
-        seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
-        intersection = reduce(set.intersection, seqs_ngrams)
-
-        try:
-            longest = max(intersection, key=len)
-        except ValueError:
-            # no common sequence found
-            longest = ""
-        return longest if longest.strip() else None
-
-    def find_and_remove_header_footer(
-        self, pages: List[str], n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
-    ) -> Tuple[List[str], Optional[str], Optional[str]]:
-        """
-        Heuristic to find footers and headers across different pages by searching for the longest common string.
-        For headers we only search in the first n_chars characters (for footer: last n_chars).
-        Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
-        but won't detect "Page 3 of 4" or similar.
-
-        :param pages: list of strings, one string per page
-        :param n_chars: number of first/last characters where the header/footer shall be searched in
-        :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
-        :param n_last_pages_to_ignore: number of last pages to ignore
-        :return: (cleaned pages, found_header_str, found_footer_str)
-        """
-
-        # header
-        start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
-        found_header = self.find_longest_common_ngram(start_of_pages)
-        if found_header:
-            pages = [page.replace(found_header, "") for page in pages]
-
-        # footer
-        end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
-        found_footer = self.find_longest_common_ngram(end_of_pages)
-        if found_footer:
-            pages = [page.replace(found_footer, "") for page in pages]
-        return pages, found_header, found_footer
haystack/indexing/file_converters/txt.py (new file, 93 lines)

@@ -0,0 +1,93 @@
import logging
import re
from pathlib import Path
from typing import List, Optional

from haystack.indexing.file_converters.base import BaseConverter

logger = logging.getLogger(__name__)


class TextConverter(BaseConverter):
    def __init__(
        self,
        remove_numeric_tables: Optional[bool] = False,
        remove_whitespace: Optional[bool] = None,
        remove_empty_lines: Optional[bool] = None,
        remove_header_footer: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
    ):
        """
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for answers.
                                      The rows containing strings are thus retained in this option.
        :param remove_whitespace: strip whitespace before and after each line in the text.
        :param remove_empty_lines: collapse runs of more than two empty lines in the text.
        :param remove_header_footer: use a heuristic to remove footers and headers across different pages by searching
                                     for the longest common string. This heuristic uses exact matches and therefore
                                     works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
                                     or similar.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add a test for encoding errors. If the extracted text is
                                not one of the valid languages, then it is likely an encoding error resulting
                                in garbled text.
        """

        super().__init__(
            remove_numeric_tables=remove_numeric_tables,
            remove_whitespace=remove_whitespace,
            remove_empty_lines=remove_empty_lines,
            remove_header_footer=remove_header_footer,
            valid_languages=valid_languages,
        )

    def extract_pages(self, file_path: Path) -> List[str]:
        with open(file_path) as f:
            text = f.read()
        pages = text.split("\f")

        cleaned_pages = []
        for page in pages:
            lines = page.splitlines()
            cleaned_lines = []
            for line in lines:
                words = line.split()
                digits = [word for word in words if any(i.isdigit() for i in word)]

                # remove lines having > 40% of words as digits AND not ending with a period(.)
                if self.remove_numeric_tables:
                    if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
                        logger.debug(f"Removing line '{line}' from {file_path}")
                        continue

                if self.remove_whitespace:
                    line = line.strip()

                cleaned_lines.append(line)

            page = "\n".join(cleaned_lines)

            if self.remove_empty_lines:
                page = re.sub(r"\n\n+", "\n\n", page)

            cleaned_pages.append(page)

        if self.valid_languages:
            document_text = "".join(cleaned_pages)
            if not self.validate_language(document_text):
                logger.warning(
                    f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
                    f"been decoded in the correct text format."
                )

        if self.remove_header_footer:
            cleaned_pages, header, footer = self.find_and_remove_header_footer(
                cleaned_pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
            )
            logger.info(f"Removed header '{header}' and footer '{footer}' in {file_path}")

        return cleaned_pages
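A hedged usage sketch for the new TextConverter (the file path is illustrative; "pages" are the form-feed-separated sections of the input file):

from pathlib import Path
from haystack.indexing.file_converters.txt import TextConverter

converter = TextConverter(
    remove_numeric_tables=True,
    remove_whitespace=True,
    remove_empty_lines=True,
    remove_header_footer=True,
    valid_languages=["en"],
)
pages = converter.extract_pages(Path("data/report.txt"))  # hypothetical path
print(f"Extracted {len(pages)} pages")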
haystack/indexing/utils.py

@@ -7,7 +7,7 @@ from typing import Callable, List, Optional

 from farm.data_handler.utils import http_get

-from haystack.indexing.file_converters.pdftotext import PDFToTextConverter
+from haystack.indexing.file_converters.pdf import PDFToTextConverter

 logger = logging.getLogger(__name__)

@@ -1,7 +1,7 @@
 import logging
 from pathlib import Path

-from haystack.indexing.file_converters.pdftotext import PDFToTextConverter
+from haystack.indexing.file_converters.pdf import PDFToTextConverter

 logger = logging.getLogger(__name__)
