mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-25 05:58:57 +00:00
Clarify docs for PDF conversion, languages and encodings (#1570)
* Clarify PDF conversion, languages and encodings The parameter name `valid_languages` may be a bit misleading when reading only the tutorials. Users may incorrectly assume that it enforces that the conversion only works for those languages, when it is actually more of a check. - Provided clarifications in the tutorials to highlight what valid_languages does and that changing the encoding may give better results for their language of choice - Updated the command for `pdftotext` to the correct one * Allow encodings for `convert_files_to_dicts` - Set option of passing encoding to the converters. Trying even for some Latin1 languages, the converter does not handle it well. A potential issue is that the encoding defaults to None, which is the default for the other converters, but not for the PDFToTextConverter. Could add a check and change the encoding to Latin1 for pdf if set to None. Was considering adding it to **kwargs, but since it may be a commonly used feature to be documented, I added it as a keyword argument instead. Would love to hear your input and feedback on it. * Set back PDF default encoding * Update documentation
This commit is contained in:
parent
dbb32c4f79
commit
69a0c9f2ed
@ -86,7 +86,7 @@ class PDFToTextConverter(BaseConverter):
|
||||
others if your doc contains special characters (e.g. German Umlauts, Cyrillic characters ...).
|
||||
Note: With "UTF-8" we experienced cases, where a simple "fi" gets wrongly parsed as
|
||||
"xef\xac\x81c" (see test cases). That's why we keep "Latin 1" as default here.
|
||||
(See list of available encodings by running `pdftotext -listencodings` in the terminal)
|
||||
(See list of available encodings by running `pdftotext -listenc` in the terminal)
|
||||
"""
|
||||
|
||||
pages = self._read_pdf(file_path, layout=False, encoding=encoding)
|
||||
|
||||
@ -212,8 +212,12 @@ def _extract_docs_and_labels_from_dict(document_dict: Dict, preprocessor: PrePro
|
||||
return docs, labels, problematic_ids
|
||||
|
||||
|
||||
def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False) -> \
|
||||
List[dict]:
|
||||
def convert_files_to_dicts(
|
||||
dir_path: str,
|
||||
clean_func: Optional[Callable] = None,
|
||||
split_paragraphs: bool = False,
|
||||
encoding: Optional[str] = None
|
||||
) -> List[dict]:
|
||||
"""
|
||||
Convert all files(.txt, .pdf, .docx) in the sub-directories of the given path to Python dicts that can be written to a
|
||||
Document Store.
|
||||
@ -221,6 +225,7 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None,
|
||||
:param dir_path: path for the documents to be written to the DocumentStore
|
||||
:param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
|
||||
:param split_paragraphs: split text in paragraphs.
|
||||
:param encoding: character encoding to use when converting pdf documents.
|
||||
|
||||
:return: None
|
||||
"""
|
||||
@ -252,8 +257,14 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None,
|
||||
documents = []
|
||||
for suffix, paths in suffix2paths.items():
|
||||
for path in paths:
|
||||
if encoding is None and suffix == '.pdf':
|
||||
encoding = "Latin1"
|
||||
logger.info('Converting {}'.format(path))
|
||||
document = suffix2converter[suffix].convert(file_path=path, meta=None)
|
||||
document = suffix2converter[suffix].convert(
|
||||
file_path=path,
|
||||
meta=None,
|
||||
encoding=encoding,
|
||||
)
|
||||
text = document["text"]
|
||||
|
||||
if clean_func:
|
||||
|
||||
@ -145,7 +145,9 @@
|
||||
"\n",
|
||||
"Haystack's converter classes are designed to help you turn files on your computer into the documents\n",
|
||||
"that can be processed by the Haystack pipeline.\n",
|
||||
"There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika."
|
||||
"There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika.\n",
|
||||
"The parameter `valid_languages` does not convert files to the target language, but checks if the conversion worked as expected.\n",
|
||||
"For converting PDFs, try changing the encoding to UTF-8 if the conversion isn't great."
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
@ -518,4 +520,4 @@
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
||||
}
|
||||
|
||||
@ -40,6 +40,8 @@ def tutorial8_preprocessing():
|
||||
Haystack's converter classes are designed to help you turn files on your computer into the documents
|
||||
that can be processed by the Haystack pipeline.
|
||||
There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika.
|
||||
The parameter `valid_languages` does not convert files to the target language, but checks if the conversion worked as expected.
|
||||
For converting PDFs, try changing the encoding to UTF-8 if the conversion isn't great.
|
||||
"""
|
||||
|
||||
# Here are some examples of how you would use file converters
|
||||
@ -147,4 +149,4 @@ if __name__ == "__main__":
|
||||
|
||||
# This Haystack script was made with love by deepset in Berlin, Germany
|
||||
# Haystack: https://github.com/deepset-ai/haystack
|
||||
# deepset: https://deepset.ai/
|
||||
# deepset: https://deepset.ai/
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user