mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-29 02:46:39 +00:00
Add API endpoint to upload files (#154)
This commit is contained in:
parent
8bcc4b26a1
commit
a349eef0db
@ -41,6 +41,16 @@ if EXCLUDE_META_DATA_FIELDS:
|
|||||||
# Embedding model configuration: filesystem path and framework format ("farm" or "sentence_transformers"-style).
EMBEDDING_MODEL_PATH = os.getenv("EMBEDDING_MODEL_PATH", None)
EMBEDDING_MODEL_FORMAT = os.getenv("EMBEDDING_MODEL_FORMAT", "farm")

# File uploads
# Directory where files posted to the upload endpoint are persisted.
FILE_UPLOAD_PATH = os.getenv("FILE_UPLOAD_PATH", "file-uploads")

# Text-cleaning flags for the file converters. The env vars hold the strings
# "True"/"False"; any casing is accepted, everything else counts as False.
REMOVE_NUMERIC_TABLES = os.getenv("REMOVE_NUMERIC_TABLES", "True").lower() == "true"
REMOVE_WHITESPACE = os.getenv("REMOVE_WHITESPACE", "True").lower() == "true"
REMOVE_EMPTY_LINES = os.getenv("REMOVE_EMPTY_LINES", "True").lower() == "true"
REMOVE_HEADER_FOOTER = os.getenv("REMOVE_HEADER_FOOTER", "True").lower() == "true"

# Optional whitelist of ISO 639-1 language codes, given as a Python literal
# list in the env var, e.g. VALID_LANGUAGES="['en', 'de']".
VALID_LANGUAGES = os.getenv("VALID_LANGUAGES", None)
if VALID_LANGUAGES:
    # literal_eval parses the list safely (no arbitrary code execution).
    VALID_LANGUAGES = ast.literal_eval(VALID_LANGUAGES)

# Monitoring
APM_SERVER = os.getenv("APM_SERVER", None)
APM_SERVICE_NAME = os.getenv("APM_SERVICE_NAME", "haystack-backend")
|
80
haystack/api/controller/file_upload.py
Normal file
80
haystack/api/controller/file_upload.py
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, List
|
||||||
|
|
||||||
|
from fastapi import APIRouter
|
||||||
|
from fastapi import HTTPException
|
||||||
|
from fastapi import UploadFile, File, Form
|
||||||
|
|
||||||
|
from haystack.api.config import DB_HOST, DB_PORT, DB_USER, DB_PW, DB_INDEX, ES_CONN_SCHEME, TEXT_FIELD_NAME, \
|
||||||
|
SEARCH_FIELD_NAME, FILE_UPLOAD_PATH, EMBEDDING_DIM, EMBEDDING_FIELD_NAME, EXCLUDE_META_DATA_FIELDS, VALID_LANGUAGES, \
|
||||||
|
FAQ_QUESTION_FIELD_NAME, REMOVE_NUMERIC_TABLES, REMOVE_WHITESPACE, REMOVE_EMPTY_LINES, REMOVE_HEADER_FOOTER
|
||||||
|
from haystack.database.elasticsearch import ElasticsearchDocumentStore
|
||||||
|
from haystack.indexing.file_converters.pdf import PDFToTextConverter
|
||||||
|
from haystack.indexing.file_converters.txt import TextConverter
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)

router = APIRouter()


# All connection settings come from the environment-driven config module; a
# single module-level store instance is shared by every request this
# controller handles.
_store_params = dict(
    host=DB_HOST,
    port=DB_PORT,
    username=DB_USER,
    password=DB_PW,
    index=DB_INDEX,
    scheme=ES_CONN_SCHEME,
    ca_certs=False,
    verify_certs=False,
    text_field=TEXT_FIELD_NAME,
    search_fields=SEARCH_FIELD_NAME,
    embedding_dim=EMBEDDING_DIM,
    embedding_field=EMBEDDING_FIELD_NAME,
    excluded_meta_data=EXCLUDE_META_DATA_FIELDS,  # type: ignore
    faq_question_field=FAQ_QUESTION_FIELD_NAME,
)
document_store = ElasticsearchDocumentStore(**_store_params)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/file-upload")
def upload_file_to_document_store(
    file: UploadFile = File(...),
    remove_numeric_tables: Optional[bool] = Form(REMOVE_NUMERIC_TABLES),
    remove_whitespace: Optional[bool] = Form(REMOVE_WHITESPACE),
    remove_empty_lines: Optional[bool] = Form(REMOVE_EMPTY_LINES),
    remove_header_footer: Optional[bool] = Form(REMOVE_HEADER_FOOTER),
    valid_languages: Optional[List[str]] = Form(VALID_LANGUAGES),
) -> None:
    """
    Persist an uploaded .pdf or .txt file, convert it to text and index the
    result in the document store.

    :param file: the uploaded file (multipart form field)
    :param remove_numeric_tables: heuristic removal of numeric table rows (see converters)
    :param remove_whitespace: strip whitespace around each line
    :param remove_empty_lines: collapse runs of empty lines
    :param remove_header_footer: strip repeated headers/footers across pages
    :param valid_languages: ISO 639-1 codes used to sanity-check the decoded text
    :raises HTTPException: 415 for any extension other than .pdf / .txt
    """
    try:
        # Make sure the upload directory exists before writing into it;
        # otherwise the very first upload on a fresh deployment fails.
        upload_dir = Path(FILE_UPLOAD_PATH)
        upload_dir.mkdir(parents=True, exist_ok=True)

        # Prefix with a random hex id so identical filenames never collide.
        file_path = upload_dir / f"{uuid.uuid4().hex}_{file.filename}"
        with file_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        extension = file.filename.split(".")[-1].lower()  # computed once, reused below
        if extension == "pdf":
            converter = PDFToTextConverter(
                remove_numeric_tables=remove_numeric_tables,
                remove_whitespace=remove_whitespace,
                remove_empty_lines=remove_empty_lines,
                remove_header_footer=remove_header_footer,
                valid_languages=valid_languages,
            )
        elif extension == "txt":
            converter = TextConverter(
                remove_numeric_tables=remove_numeric_tables,
                remove_whitespace=remove_whitespace,
                remove_empty_lines=remove_empty_lines,
                remove_header_footer=remove_header_footer,
                valid_languages=valid_languages,
            )
        else:
            raise HTTPException(status_code=415, detail="Only .pdf and .txt file formats are supported.")

        pages = converter.extract_pages(file_path)
        document = {TEXT_FIELD_NAME: "\n".join(pages), "name": file.filename}
        document_store.write_documents([document])

    finally:
        # Always release the temporary spooled file handle FastAPI gives us.
        file.file.close()
|
@ -1,8 +1,9 @@
|
|||||||
from fastapi import APIRouter

from haystack.api.controller import search, feedback, file_upload

# Top-level API router aggregating every controller sub-router.
router = APIRouter()

# Mount each controller under its own OpenAPI tag.
for _controller, _tag in ((search, "search"), (feedback, "feedback"), (file_upload, "file-upload")):
    router.include_router(_controller.router, tags=[_tag])
||||||
|
@ -1,6 +1,10 @@
|
|||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
|
from functools import partial, reduce
|
||||||
|
from itertools import chain
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional
|
from typing import List, Optional, Tuple, Generator, Set
|
||||||
|
|
||||||
|
import langdetect
|
||||||
|
|
||||||
|
|
||||||
class BaseConverter:
|
class BaseConverter:
|
||||||
@ -43,3 +47,97 @@ class BaseConverter:
|
|||||||
@abstractmethod
def extract_pages(self, file_path: Path) -> List[str]:
    """
    Convert the file at *file_path* into a list of strings, one per page.

    Must be implemented by each concrete converter (PDF, plain text, ...).
    """
    pass
|
||||||
|
|
||||||
|
def validate_language(self, text: str) -> bool:
    """
    Return True if *text* is written in one of ``self.valid_languages``.

    When no valid languages are configured, every text passes.
    """
    if not self.valid_languages:
        return True

    try:
        detected = langdetect.detect(text)
    except langdetect.lang_detect_exception.LangDetectException:
        # Detection can fail on very short or non-linguistic text.
        detected = None

    return detected in self.valid_languages
||||||
|
|
||||||
|
def find_and_remove_header_footer(
    self, pages: List[str], n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
) -> Tuple[List[str], Optional[str], Optional[str]]:
    """
    Heuristic to find footers and headers across different pages by searching for the longest common string.
    For headers we only search in the first n_chars characters (for footer: last n_chars).
    Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
    but won't detect "Page 3 of 4" or similar.

    :param pages: list of strings, one string per page
    :param n_chars: number of first/last characters where the header/footer shall be searched in
    :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
    :param n_last_pages_to_ignore: number of last pages to ignore
    :return: (cleaned pages, found_header_str, found_footer_str)
    """
    # BUG FIX: with n_last_pages_to_ignore == 0 the old slice ``[a:-0]``
    # collapsed to ``[a:0]`` (empty). Map "ignore zero last pages" to None.
    last = -n_last_pages_to_ignore if n_last_pages_to_ignore else None

    # header
    start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:last]]
    found_header = self._find_longest_common_ngram(start_of_pages)
    if found_header:
        pages = [page.replace(found_header, "") for page in pages]

    # footer — intentionally searched on the header-stripped pages.
    end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:last]]
    found_footer = self._find_longest_common_ngram(end_of_pages)
    if found_footer:
        pages = [page.replace(found_footer, "") for page in pages]
    return pages, found_header, found_footer
||||||
|
|
||||||
|
def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
|
||||||
|
"""
|
||||||
|
Return ngram (of tokens - currently splitted by whitespace)
|
||||||
|
:param seq: str, string from which the ngram shall be created
|
||||||
|
:param n: int, n of ngram
|
||||||
|
:return: str, ngram as string
|
||||||
|
"""
|
||||||
|
|
||||||
|
# In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
|
||||||
|
# we add a space here and remove it after creation of the ngrams again (see below)
|
||||||
|
seq = seq.replace("\n", " \n")
|
||||||
|
seq = seq.replace("\t", " \t")
|
||||||
|
|
||||||
|
words = seq.split(" ")
|
||||||
|
ngrams = (
|
||||||
|
" ".join(words[i: i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
|
||||||
|
)
|
||||||
|
|
||||||
|
return ngrams
|
||||||
|
|
||||||
|
def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
    """
    Return the set of all token ngrams of *seq* with sizes in
    ``[min_ngram, max_ngram)``; when *max_ngram* is falsy, sizes run up to
    ``len(seq)`` instead.
    """
    upper = max_ngram if max_ngram else len(seq)
    all_grams: Set[str] = set()
    for size in range(min_ngram, upper):
        all_grams.update(self._ngram(seq, size))
    return all_grams
|
||||||
|
|
||||||
|
def _find_longest_common_ngram(self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3) -> Optional[str]:
    """
    Find the longest common ngram across different text sequences (e.g. start of pages).
    Considering all ngrams between the specified range. Helpful for finding footers, headers etc.

    :param sequences: list of strings that shall be searched for common n_grams
    :param max_ngram: maximum length of ngram to consider
    :param min_ngram: minimum length of ngram to consider
    :return: common string of all sections, or None if no (non-blank) one exists
    """
    per_sequence = (self._allngram(seq, min_ngram, max_ngram) for seq in sequences)
    common = reduce(set.intersection, per_sequence)

    # max() with default avoids the ValueError the empty intersection would raise.
    longest = max(common, key=len, default="")
    return longest if longest.strip() else None
|
||||||
|
|
||||||
|
@ -1,12 +1,8 @@
|
|||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
from functools import partial, reduce
|
|
||||||
from itertools import chain
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional, Tuple, Generator, Set
|
from typing import List, Optional
|
||||||
|
|
||||||
import langdetect
|
|
||||||
|
|
||||||
from haystack.indexing.file_converters.base import BaseConverter
|
from haystack.indexing.file_converters.base import BaseConverter
|
||||||
|
|
||||||
@ -106,7 +102,7 @@ class PDFToTextConverter(BaseConverter):
|
|||||||
|
|
||||||
if self.valid_languages:
|
if self.valid_languages:
|
||||||
document_text = "".join(cleaned_pages)
|
document_text = "".join(cleaned_pages)
|
||||||
if not self._validate_language(document_text):
|
if not self.validate_language(document_text):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
|
f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
|
||||||
f"been decoded in the correct text format."
|
f"been decoded in the correct text format."
|
||||||
@ -138,95 +134,3 @@ class PDFToTextConverter(BaseConverter):
|
|||||||
pages = pages[:-1] # the last page in the split is always empty.
|
pages = pages[:-1] # the last page in the split is always empty.
|
||||||
return pages
|
return pages
|
||||||
|
|
||||||
def _validate_language(self, text: str) -> bool:
|
|
||||||
"""
|
|
||||||
Validate if the language of the text is one of valid languages.
|
|
||||||
"""
|
|
||||||
if not self.valid_languages:
|
|
||||||
return True
|
|
||||||
|
|
||||||
try:
|
|
||||||
lang = langdetect.detect(text)
|
|
||||||
except langdetect.lang_detect_exception.LangDetectException:
|
|
||||||
lang = None
|
|
||||||
|
|
||||||
if lang in self.valid_languages:
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
|
|
||||||
"""
|
|
||||||
Return ngram (of tokens - currently splitted by whitespace)
|
|
||||||
:param seq: str, string from which the ngram shall be created
|
|
||||||
:param n: int, n of ngram
|
|
||||||
:return: str, ngram as string
|
|
||||||
"""
|
|
||||||
|
|
||||||
# In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
|
|
||||||
# we add a space here and remove it after creation of the ngrams again (see below)
|
|
||||||
seq = seq.replace("\n", " \n")
|
|
||||||
seq = seq.replace("\t", " \t")
|
|
||||||
|
|
||||||
words = seq.split(" ")
|
|
||||||
ngrams = (
|
|
||||||
" ".join(words[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
|
|
||||||
)
|
|
||||||
|
|
||||||
return ngrams
|
|
||||||
|
|
||||||
def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
|
|
||||||
lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
|
|
||||||
ngrams = map(partial(self._ngram, seq), lengths)
|
|
||||||
res = set(chain.from_iterable(ngrams))
|
|
||||||
return res
|
|
||||||
|
|
||||||
def find_longest_common_ngram(self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3) -> Optional[str]:
|
|
||||||
"""
|
|
||||||
Find the longest common ngram across different text sequences (e.g. start of pages).
|
|
||||||
Considering all ngrams between the specified range. Helpful for finding footers, headers etc.
|
|
||||||
|
|
||||||
:param sequences: list[str], list of strings that shall be searched for common n_grams
|
|
||||||
:param max_ngram: int, maximum length of ngram to consider
|
|
||||||
:param min_ngram: minimum length of ngram to consider
|
|
||||||
:return: str, common string of all sections
|
|
||||||
"""
|
|
||||||
|
|
||||||
seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
|
|
||||||
intersection = reduce(set.intersection, seqs_ngrams)
|
|
||||||
|
|
||||||
try:
|
|
||||||
longest = max(intersection, key=len)
|
|
||||||
except ValueError:
|
|
||||||
# no common sequence found
|
|
||||||
longest = ""
|
|
||||||
return longest if longest.strip() else None
|
|
||||||
|
|
||||||
def find_and_remove_header_footer(
|
|
||||||
self, pages: List[str], n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
|
|
||||||
) -> Tuple[List[str], Optional[str], Optional[str]]:
|
|
||||||
"""
|
|
||||||
Heuristic to find footers and headers across different pages by searching for the longest common string.
|
|
||||||
For headers we only search in the first n_chars characters (for footer: last n_chars).
|
|
||||||
Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
|
|
||||||
but won't detect "Page 3 of 4" or similar.
|
|
||||||
|
|
||||||
:param pages: list of strings, one string per page
|
|
||||||
:param n_chars: number of first/last characters where the header/footer shall be searched in
|
|
||||||
:param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
|
|
||||||
:param n_last_pages_to_ignore: number of last pages to ignore
|
|
||||||
:return: (cleaned pages, found_header_str, found_footer_str)
|
|
||||||
"""
|
|
||||||
|
|
||||||
# header
|
|
||||||
start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
|
|
||||||
found_header = self.find_longest_common_ngram(start_of_pages)
|
|
||||||
if found_header:
|
|
||||||
pages = [page.replace(found_header, "") for page in pages]
|
|
||||||
|
|
||||||
# footer
|
|
||||||
end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
|
|
||||||
found_footer = self.find_longest_common_ngram(end_of_pages)
|
|
||||||
if found_footer:
|
|
||||||
pages = [page.replace(found_footer, "") for page in pages]
|
|
||||||
return pages, found_header, found_footer
|
|
93
haystack/indexing/file_converters/txt.py
Normal file
93
haystack/indexing/file_converters/txt.py
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from haystack.indexing.file_converters.base import BaseConverter
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class TextConverter(BaseConverter):
    """Converter that turns plain-text files into cleaned, per-page strings."""

    def __init__(
        self,
        remove_numeric_tables: Optional[bool] = False,
        remove_whitespace: Optional[bool] = None,
        remove_empty_lines: Optional[bool] = None,
        remove_header_footer: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
    ):
        """
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could possible candidate for searching answers.
                                      The rows containing strings are thus retained in this option.
        :param remove_whitespace: strip whitespaces before or after each line in the text.
        :param remove_empty_lines: remove more than two empty lines in the text.
        :param remove_header_footer: use heuristic to remove footers and headers across different pages by searching
                                     for the longest common string. This heuristic uses exact matches and therefore
                                     works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
                                     or similar.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add test for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be encoding error resulting
                                in garbled text.
        """
        super().__init__(
            remove_numeric_tables=remove_numeric_tables,
            remove_whitespace=remove_whitespace,
            remove_empty_lines=remove_empty_lines,
            remove_header_footer=remove_header_footer,
            valid_languages=valid_languages,
        )

    def extract_pages(self, file_path: Path) -> List[str]:
        """
        Read a plain-text file and return the cleaned pages (pages are
        delimited by form-feed characters, "\\f").
        """
        # NOTE(review): opens with the platform default encoding — presumably
        # the uploads are UTF-8; confirm before hardening.
        with open(file_path) as f:
            text = f.read()
        pages = text.split("\f")

        cleaned_pages = []
        for page in pages:
            cleaned_lines = []
            for line in page.splitlines():
                if self.remove_numeric_tables:
                    # Only pay for the digit scan when the option is enabled.
                    words = line.split()
                    digits = [word for word in words if any(i.isdigit() for i in word)]
                    # remove lines having > 40% of words as digits AND not ending with a period(.)
                    if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
                        logger.debug(f"Removing line '{line}' from {file_path}")
                        continue

                if self.remove_whitespace:
                    line = line.strip()

                cleaned_lines.append(line)

            page = "\n".join(cleaned_lines)

            if self.remove_empty_lines:
                # Collapse runs of blank lines down to a single blank line.
                page = re.sub(r"\n\n+", "\n\n", page)

            cleaned_pages.append(page)

        if self.valid_languages:
            document_text = "".join(cleaned_pages)
            if not self.validate_language(document_text):
                logger.warning(
                    f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
                    f"been decoded in the correct text format."
                )

        if self.remove_header_footer:
            cleaned_pages, header, footer = self.find_and_remove_header_footer(
                cleaned_pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
            )
            logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")

        return cleaned_pages
|
|
@ -7,7 +7,7 @@ from typing import Callable, List, Optional
|
|||||||
|
|
||||||
from farm.data_handler.utils import http_get
|
from farm.data_handler.utils import http_get
|
||||||
|
|
||||||
from haystack.indexing.file_converters.pdftotext import PDFToTextConverter
|
from haystack.indexing.file_converters.pdf import PDFToTextConverter
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from haystack.indexing.file_converters.pdftotext import PDFToTextConverter
|
from haystack.indexing.file_converters.pdf import PDFToTextConverter
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user