Clarify docs for PDF conversion, languages and encodings (#1570)

* Clarify PDF conversion, languages and encodings The parameter name `valid_languages` may be a bit misleading when reading only the tutorials. Users may incorrectly assume that it enforces that the conversion only works for those languages, when it is really more of a check. - Provided clarifications in the tutorials to highlight what valid_languages does and that changing the encoding may give better results for their language of choice - Updated the command for `pdftotext` to the correct one * Allow encodings for `convert_files_to_dicts` - Set option of passing encoding to the converters. Even for some Latin1 languages, the converter does not handle it well. A potential issue is that the encoding defaults to None, which is the default for the other converters, but not for the PDFToTextConverter. Could add a check and change the encoding to Latin1 for pdf if set to None. Was considering adding it to **kwargs, but since it may be a commonly used feature to be documented, I added it as a keyword argument instead. Would love to hear your input and feedback on it. * Set back PDF default encoding * Update documentation
2025-12-25 05:58:57 +00:00 · 2021-10-11 09:30:12 +02:00 · 2021-10-11 09:30:12 +02:00 · 69a0c9f2ed
commit 69a0c9f2ed
parent dbb32c4f79
4 changed files with 22 additions and 7 deletions
--- a/haystack/file_converter/pdf.py
+++ b/haystack/file_converter/pdf.py
@ -86,7 +86,7 @@ class PDFToTextConverter(BaseConverter):
                         others if your doc contains special characters (e.g. German Umlauts, Cyrillic characters ...).
                         Note: With "UTF-8" we experienced cases, where a simple "fi" gets wrongly parsed as
                         "xef\xac\x81c" (see test cases). That's why we keep "Latin 1" as default here.
-                         (See list of available encodings by running `pdftotext -listencodings` in the terminal)
+                         (See list of available encodings by running `pdftotext -listenc` in the terminal)
        """

        pages = self._read_pdf(file_path, layout=False, encoding=encoding)
--- a/haystack/preprocessor/utils.py
+++ b/haystack/preprocessor/utils.py
@ -212,8 +212,12 @@ def _extract_docs_and_labels_from_dict(document_dict: Dict, preprocessor: PrePro
    return docs, labels, problematic_ids


-def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False) -> \
-        List[dict]:
+def convert_files_to_dicts(
+        dir_path: str, 
+        clean_func: Optional[Callable] = None, 
+        split_paragraphs: bool = False,
+        encoding: Optional[str] = None
+) -> List[dict]:
    """
    Convert all files(.txt, .pdf, .docx) in the sub-directories of the given path to Python dicts that can be written to a
    Document Store.
@ -221,6 +225,7 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None,
    :param dir_path: path for the documents to be written to the DocumentStore
    :param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
    :param split_paragraphs: split text in paragraphs.
+    :param encoding: character encoding to use when converting pdf documents.

    :return: None
    """
@ -252,8 +257,14 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None,
    documents = []
    for suffix, paths in suffix2paths.items():
        for path in paths:
+            if encoding is None and suffix == '.pdf':
+                encoding = "Latin1"
            logger.info('Converting {}'.format(path))
-            document = suffix2converter[suffix].convert(file_path=path, meta=None)
+            document = suffix2converter[suffix].convert(
+                    file_path=path, 
+                    meta=None,
+                    encoding=encoding,
+            )
            text = document["text"]

            if clean_func:
--- a/tutorials/Tutorial8_Preprocessing.ipynb
+++ b/tutorials/Tutorial8_Preprocessing.ipynb
@ -145,7 +145,9 @@
    "\n",
    "Haystack's converter classes are designed to help you turn files on your computer into the documents\n",
    "that can be processed by the Haystack pipeline.\n",
-    "There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika."
+    "There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika.\n",
+    "The parameter `valid_languages` does not convert files to the target language, but checks if the conversion worked as expected.\n",
+    "For converting PDFs, try changing the encoding to UTF-8 if the conversion isn't great."
   ],
   "metadata": {
    "collapsed": false,
@ -518,4 +520,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 0
-}
+}
--- a/tutorials/Tutorial8_Preprocessing.py
+++ b/tutorials/Tutorial8_Preprocessing.py
@ -40,6 +40,8 @@ def tutorial8_preprocessing():
    Haystack's converter classes are designed to help you turn files on your computer into the documents
    that can be processed by the Haystack pipeline.
    There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika.
+    The parameter `valid_languages` does not convert files to the target language, but checks if the conversion worked as expected.
+    For converting PDFs, try changing the encoding to UTF-8 if the conversion isn't great.
    """

    # Here are some examples of how you would use file converters
@ -147,4 +149,4 @@ if __name__ == "__main__":

 # This Haystack script was made with love by deepset in Berlin, Germany
 # Haystack: https://github.com/deepset-ai/haystack
-# deepset: https://deepset.ai/
+# deepset: https://deepset.ai/