Change default encoding for PDFToTextConverter from Latin 1 to UTF-8 (#2420)
* Change default encoding for PDFToTextConverter
* Update Documentation & Code Style
* Improve docstring
* Add list of ligatures to ignore and add the possibility to modify such list at need
* Add docstring
* Add tests
* Rename parameter
* Move implementation into the base converter to make mypy happier
* mypy and pylint
* mypy
* move encoding parameter to init of PDFToTextConverter
* make utf8 default and fix mypy
* remove note on encoding in tutorial8
* skip OCRConverter and test converter.run

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Julian Risch <julian.risch@deepset.ai>
This commit is contained in: parent a4e603ce87 · commit 01ea4bf21f
@@ -43,7 +43,7 @@ In this case the id will be generated by using the content and the defined metadata.

```python
@abstractmethod
-def convert(file_path: Path, meta: Optional[Dict[str, str]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path, meta: Optional[Dict[str, str]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
```

Convert a file to a dictionary containing the text and any associated meta data.
@@ -65,7 +65,7 @@ The rows containing strings are thus retained in this option.

This option can be used to add a test for encoding errors. If the extracted text is
not in one of the valid languages, it is likely an encoding error resulting
in garbled text.
-- `encoding`: Select the file encoding (default is `utf-8`)
+- `encoding`: Select the file encoding (default is `UTF-8`)
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
@@ -81,6 +81,40 @@ def validate_language(text: str, valid_languages: Optional[List[str]] = None) ->

Validate if the language of the text is one of the valid languages.

<a id="base.BaseConverter.run"></a>

#### run

```python
def run(file_paths: Union[Path, List[Path]], meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, remove_numeric_tables: Optional[bool] = None, known_ligatures: Dict[str, str] = KNOWN_LIGATURES, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8")
```

Extract text from a file.

**Arguments**:

- `file_paths`: Path to the files you want to convert
- `meta`: Optional dictionary with metadata that shall be attached to all resulting documents.
Can be any custom keys and values.
- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
The tabular structures in documents might be noise for the reader model if it
does not have table parsing capability for finding answers. However, tables
may also have long strings that could be possible candidates for searching answers.
The rows containing strings are thus retained in this option.
- `known_ligatures`: Some converters tend to recognize clusters of letters as ligatures, such as "ﬀ" (double f).
Such ligatures, however, make text hard to compare with the content of other files,
which are generally ligature free. Therefore we automatically find and replace the most
common ligatures with their split counterparts. The default mapping is in
`haystack.nodes.file_converter.base.KNOWN_LIGATURES`: it is rather biased towards Latin alphabets
but excludes all ligatures that are known to be used in IPA.
You can use this parameter to provide your own set of ligatures to clean up from the documents.
- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used to add a test for encoding errors. If the extracted text is
not in one of the valid languages, it is likely an encoding error resulting
in garbled text.
- `encoding`: Select the file encoding (default is `UTF-8`)
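
For illustration, a minimal usage sketch of `run` (hypothetical file name; assumes the `haystack` package and `pdftotext` are installed):

```python
from pathlib import Path
from haystack.nodes import PDFToTextConverter

converter = PDFToTextConverter()  # encoding defaults to "UTF-8"
# Replace only the "fi" ligature instead of the full KNOWN_LIGATURES mapping:
result, _ = converter.run(file_paths=Path("sample.pdf"), known_ligatures={"ﬁ": "fi"})
documents = result["documents"]
```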

<a id="docx"></a>

# Module docx

@@ -261,7 +295,7 @@ class PDFToTextConverter(BaseConverter)

#### \_\_init\_\_

```python
-def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None)
+def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8")
```

**Arguments**:
@@ -280,13 +314,16 @@ in garbled text.

attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
+- `encoding`: Encoding that will be passed as `-enc` parameter to `pdftotext`.
+Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...).
+(See the list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal.)

<a id="pdf.PDFToTextConverter.convert"></a>

#### convert

```python
-def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "Latin1", id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
```

Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
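
As a quick sketch of how the new constructor default interacts with the per-call override (hypothetical file name):

```python
from pathlib import Path
from haystack.nodes import PDFToTextConverter

converter = PDFToTextConverter()  # self.encoding defaults to "UTF-8"
docs_utf8 = converter.convert(file_path=Path("sample.pdf"))  # falls back to self.encoding
docs_latin1 = converter.convert(file_path=Path("sample.pdf"), encoding="Latin1")  # explicit override
```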

@@ -306,11 +343,7 @@ The rows containing strings are thus retained in this option.

This option can be used to add a test for encoding errors. If the extracted text is
not in one of the valid languages, it is likely an encoding error resulting
in garbled text.
-- `encoding`: Encoding that will be passed as -enc parameter to pdftotext. "Latin 1" is the default encoding
-of pdftotext. While this works well on many PDFs, it might be needed to switch to "UTF-8" or
-others if your doc contains special characters (e.g. German Umlauts, Cyrillic characters ...).
-Note: With "UTF-8" we experienced cases, where a simple "fi" gets wrongly parsed as
-"\xef\xac\x81c" (see test cases). That's why we keep "Latin 1" as default here.
+- `encoding`: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
+(See the list of available encodings by running `pdftotext -listenc` in the terminal.)
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
@@ -357,7 +390,7 @@ In this case the id will be generated by using the content and the defined metadata.

#### convert

```python
-def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
```

Convert a file to a dictionary containing the text and any associated meta data.

@@ -379,7 +412,7 @@ The rows containing strings are thus retained in this option.

This option can be used to add a test for encoding errors. If the extracted text is
not in one of the valid languages, it is likely an encoding error resulting
in garbled text.
-- `encoding`: Select the file encoding (default is `utf-8`)
+- `encoding`: Select the file encoding (default is `UTF-8`)
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
@@ -67,7 +67,6 @@ Haystack's converter classes are designed to help you turn files on your computer

that can be processed by the Haystack pipeline.
There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika.
The parameter `valid_languages` does not convert files to the target language, but checks if the conversion worked as expected.
-For converting PDFs, try changing the encoding to UTF-8 if the conversion isn't great.


```python

@@ -2375,6 +2375,11 @@
        "items": {
            "type": "string"
        }
    },
+   "encoding": {
+       "title": "Encoding",
+       "default": "UTF-8",
+       "type": "string"
+   }
},
"additionalProperties": false,

@@ -10,6 +10,15 @@
"description": "Version of the Haystack Pipeline file.",
"type": "string",
"oneOf": [
    {
        "const": "unstable"
    },
    {
        "const": "1.2.1rc0"
    },
    {
        "const": "1.3.0"
    },
    {
        "const": "1.3.1rc0"
    }

@@ -287,6 +296,11 @@
    "title": "Return Embedding",
    "default": false,
    "type": "boolean"
},
+"label_index": {
+   "title": "Label Index",
+   "default": "default",
+   "type": "string"
+}
},
"additionalProperties": false,

@@ -1309,6 +1323,13 @@
    "title": "Merge Multiple Column Headers",
    "default": true,
    "type": "boolean"
},
+"id_hash_keys": {
+   "title": "Id Hash Keys",
+   "type": "array",
+   "items": {
+       "type": "string"
+   }
+}
},
"required": [

@@ -1367,6 +1388,13 @@
"overwrite_existing_files": {
    "title": "Overwrite Existing Files",
    "default": true
},
+"id_hash_keys": {
+   "title": "Id Hash Keys",
+   "type": "array",
+   "items": {
+       "type": "string"
+   }
+}
},
"required": [

@@ -1507,10 +1535,10 @@
"title": "Use Auth Token",
"anyOf": [
    {
-       "type": "string"
+       "type": "boolean"
    },
    {
-       "type": "boolean"
+       "type": "string"
    }
]
}

@@ -1585,6 +1613,13 @@
    "items": {
        "type": "string"
    }
},
+"id_hash_keys": {
+   "title": "Id Hash Keys",
+   "type": "array",
+   "items": {
+       "type": "string"
+   }
+}
},
"additionalProperties": false,

@@ -1785,10 +1820,10 @@
"title": "Use Auth Token",
"anyOf": [
    {
-       "type": "string"
+       "type": "boolean"
    },
    {
-       "type": "boolean"
+       "type": "string"
    }
]
}

@@ -1979,6 +2014,14 @@
    "default": true,
    "type": "boolean"
},
+"devices": {
+   "title": "Devices",
+   "default": [],
+   "type": "array",
+   "items": {
+       "type": "string"
+   }
+},
"no_ans_boost": {
    "title": "No Ans Boost",
    "default": 0.0,

@@ -2033,6 +2076,10 @@
    "default": true,
    "type": "boolean"
},
+"confidence_threshold": {
+   "title": "Confidence Threshold",
+   "type": "number"
+},
"proxies": {
    "title": "Proxies",
    "type": "object",

@@ -2052,10 +2099,10 @@
"title": "Use Auth Token",
"anyOf": [
    {
-       "type": "string"
+       "type": "boolean"
    },
    {
-       "type": "boolean"
+       "type": "string"
    }
]
}

@@ -2148,6 +2195,13 @@
    "items": {
        "type": "string"
    }
},
+"id_hash_keys": {
+   "title": "Id Hash Keys",
+   "type": "array",
+   "items": {
+       "type": "string"
+   }
+}
},
"additionalProperties": false,

@@ -2279,6 +2333,13 @@
    "items": {
        "type": "string"
    }
},
+"id_hash_keys": {
+   "title": "Id Hash Keys",
+   "type": "array",
+   "items": {
+       "type": "string"
+   }
+}
},
"additionalProperties": false,

@@ -2320,6 +2381,18 @@
    "items": {
        "type": "string"
    }
},
+"id_hash_keys": {
+   "title": "Id Hash Keys",
+   "type": "array",
+   "items": {
+       "type": "string"
+   }
+},
+"encoding": {
+   "title": "Encoding",
+   "default": "UTF-8",
+   "type": "string"
+}
},
"additionalProperties": false,

@@ -2364,6 +2437,13 @@
    "items": {
        "type": "string"
    }
},
+"id_hash_keys": {
+   "title": "Id Hash Keys",
+   "type": "array",
+   "items": {
+       "type": "string"
+   }
+}
},
"additionalProperties": false,

@@ -2448,6 +2528,13 @@
    "items": {
        "type": "string"
    }
},
+"id_hash_keys": {
+   "title": "Id Hash Keys",
+   "type": "array",
+   "items": {
+       "type": "string"
+   }
+}
},
"additionalProperties": false,

@@ -2525,6 +2612,13 @@
    "title": "Language",
    "default": "en",
    "type": "string"
},
+"id_hash_keys": {
+   "title": "Id Hash Keys",
+   "type": "array",
+   "items": {
+       "type": "string"
+   }
+}
},
"additionalProperties": false,

@@ -3206,10 +3300,10 @@
"title": "Use Auth Token",
"anyOf": [
    {
-       "type": "string"
+       "type": "boolean"
    },
    {
-       "type": "boolean"
+       "type": "string"
    }
]
}

@@ -3300,6 +3394,13 @@
    "items": {
        "type": "string"
    }
},
+"id_hash_keys": {
+   "title": "Id Hash Keys",
+   "type": "array",
+   "items": {
+       "type": "string"
+   }
+}
},
"additionalProperties": false,

@@ -3391,6 +3492,13 @@
    "items": {
        "type": "string"
    }
},
+"id_hash_keys": {
+   "title": "Id Hash Keys",
+   "type": "array",
+   "items": {
+       "type": "string"
+   }
+}
},
"additionalProperties": false,
@@ -2883,6 +2883,11 @@
    "items": {
        "type": "string"
    }
},
+"encoding": {
+   "title": "Encoding",
+   "default": "UTF-8",
+   "type": "string"
+}
},
"additionalProperties": false,
@@ -8,6 +8,39 @@ from haystack.nodes.base import BaseComponent
from haystack.schema import Document


# https://en.wikipedia.org/wiki/Ligature_(writing)
KNOWN_LIGATURES = {
    # Latin
    "ﬀ": "ff",
    "ﬁ": "fi",
    "ﬂ": "fl",
    "ﬃ": "ffi",
    "ﬄ": "ffl",
    "ﬅ": "ft",
    "ﬆ": "st",
    "Ǳ": "DZ",
    "ǲ": "Dz",
    "ǳ": "dz",
    "Ǆ": "DŽ",
    "ǅ": "Dž",
    "ǆ": "dž",
    "Ꜩ": "Tz",
    "ꜩ": "tz",
    "🙰": "et",
    "℔": "lb",
    "ᵫ": "ue",
    "Ĳ": "IJ",
    "ĳ": "ij",  # They are both capitalized together, so the "Ij" ligature doesn't exist
    "ꝏ": "oo",  # Not the infinity sign but a double-o ligature: https://en.wikipedia.org/wiki/Ligature_(writing)#Massachusett_%EA%9D%8F
    # Armenian
    "ﬓ": "մն",
    "ﬔ": "մե",
    "ﬕ": "մի",
    "ﬖ": "վն",
    "ﬗ": "մխ",
}


class BaseConverter(BaseComponent):
    """
    Base class for implementing file converters to transform input documents to text format for ingestion in DocumentStore.
@@ -50,7 +83,7 @@ class BaseConverter(BaseComponent):
        meta: Optional[Dict[str, str]],
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
-       encoding: Optional[str] = "utf-8",
+       encoding: Optional[str] = "UTF-8",
        id_hash_keys: Optional[List[str]] = None,
    ) -> List[Document]:
        """

@@ -71,7 +104,7 @@ class BaseConverter(BaseComponent):
        This option can be used to add a test for encoding errors. If the extracted text is
        not in one of the valid languages, it is likely an encoding error resulting
        in garbled text.
-       :param encoding: Select the file encoding (default is `utf-8`)
+       :param encoding: Select the file encoding (default is `UTF-8`)
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
@@ -98,17 +131,44 @@ class BaseConverter(BaseComponent):

    def run(  # type: ignore
        self,
-       file_paths: Union[Path, List[Path]],  # type: ignore
-       meta: Optional[Union[Dict[str, str], List[Dict[str, str]]]] = None,  # type: ignore
-       remove_numeric_tables: Optional[bool] = None,  # type: ignore
-       valid_languages: Optional[List[str]] = None,  # type: ignore
+       file_paths: Union[Path, List[Path]],
+       meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None,
+       remove_numeric_tables: Optional[bool] = None,
+       known_ligatures: Dict[str, str] = KNOWN_LIGATURES,
+       valid_languages: Optional[List[str]] = None,
+       encoding: Optional[str] = "UTF-8",
    ):
        """
        Extract text from a file.

        :param file_paths: Path to the files you want to convert
        :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
            Can be any custom keys and values.
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
            The tabular structures in documents might be noise for the reader model if it
            does not have table parsing capability for finding answers. However, tables
            may also have long strings that could be possible candidates for searching answers.
            The rows containing strings are thus retained in this option.
        :param known_ligatures: Some converters tend to recognize clusters of letters as ligatures, such as "ﬀ" (double f).
            Such ligatures, however, make text hard to compare with the content of other files,
            which are generally ligature free. Therefore we automatically find and replace the most
            common ligatures with their split counterparts. The default mapping is in
            `haystack.nodes.file_converter.base.KNOWN_LIGATURES`: it is rather biased towards Latin alphabets
            but excludes all ligatures that are known to be used in IPA.
            You can use this parameter to provide your own set of ligatures to clean up from the documents.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
            (https://en.wikipedia.org/wiki/ISO_639-1) format.
            This option can be used to add a test for encoding errors. If the extracted text is
            not in one of the valid languages, it is likely an encoding error resulting
            in garbled text.
        :param encoding: Select the file encoding (default is `UTF-8`)
        """

        if isinstance(file_paths, Path):
            file_paths = [file_paths]

-       if meta is None or isinstance(meta, dict):
-           meta = [meta] * len(file_paths)  # type: ignore
+       if isinstance(meta, dict) or meta is None:
+           meta = [meta] * len(file_paths)

        documents: list = []
        for file_path, file_meta in zip(file_paths, meta):
@@ -117,8 +177,15 @@ class BaseConverter(BaseComponent):
                meta=file_meta,
                remove_numeric_tables=remove_numeric_tables,
                valid_languages=valid_languages,
+               encoding=encoding,
            ):
                documents.append(doc)

+       # Cleanup ligatures
+       for document in documents:
+           for ligature, letters in known_ligatures.items():
+               if document.content is not None:
+                   document.content = document.content.replace(ligature, letters)

        result = {"documents": documents}
        return result, "output_1"
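
A standalone sketch of what the cleanup loop above does (hypothetical input string):

```python
content = "The oﬃce traﬃc was eﬃcient."  # contains the "ﬃ" ligature glyph
known_ligatures = {"ﬃ": "ffi"}
for ligature, letters in known_ligatures.items():
    content = content.replace(ligature, letters)
print(content)  # -> The office traffic was efficient.
```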
@@ -27,6 +27,7 @@ class PDFToTextConverter(BaseConverter):
        remove_numeric_tables: bool = False,
        valid_languages: Optional[List[str]] = None,
        id_hash_keys: Optional[List[str]] = None,
+       encoding: Optional[str] = "UTF-8",
    ):
        """
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.

@@ -43,6 +44,9 @@ class PDFToTextConverter(BaseConverter):
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
+       :param encoding: Encoding that will be passed as `-enc` parameter to `pdftotext`.
+           Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...).
+           (See the list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal.)
        """
        super().__init__(
            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
@@ -65,6 +69,7 @@ class PDFToTextConverter(BaseConverter):
        )

        super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
+       self.encoding = encoding

    def convert(
        self,

@@ -72,7 +77,7 @@ class PDFToTextConverter(BaseConverter):
        meta: Optional[Dict[str, str]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
-       encoding: Optional[str] = "Latin1",
+       encoding: Optional[str] = None,
        id_hash_keys: Optional[List[str]] = None,
    ) -> List[Document]:
        """
@@ -91,11 +96,7 @@ class PDFToTextConverter(BaseConverter):
            This option can be used to add a test for encoding errors. If the extracted text is
            not in one of the valid languages, it is likely an encoding error resulting
            in garbled text.
-       :param encoding: Encoding that will be passed as -enc parameter to pdftotext. "Latin 1" is the default encoding
-           of pdftotext. While this works well on many PDFs, it might be needed to switch to "UTF-8" or
-           others if your doc contains special characters (e.g. German Umlauts, Cyrillic characters ...).
-           Note: With "UTF-8" we experienced cases, where a simple "fi" gets wrongly parsed as
-           "\xef\xac\x81c" (see test cases). That's why we keep "Latin 1" as default here.
+       :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
+           (See the list of available encodings by running `pdftotext -listenc` in the terminal.)
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
@@ -151,19 +152,25 @@ class PDFToTextConverter(BaseConverter):
        document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
        return [document]

-   def _read_pdf(self, file_path: Path, layout: bool, encoding: Optional[str] = "Latin1") -> List[str]:
+   def _read_pdf(self, file_path: Path, layout: bool, encoding: Optional[str] = None) -> List[str]:
        """
        Extract pages from the pdf file at file_path.

        :param file_path: path of the pdf file
        :param layout: whether to retain the original physical layout for a page. If disabled, PDF pages are read in
            the content stream order.
+       :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
+           (See the list of available encodings by running `pdftotext -listenc` in the terminal.)
        """
-       if layout:
-           command = ["pdftotext", "-enc", encoding, "-layout", str(file_path), "-"]
-       else:
-           command = ["pdftotext", "-enc", encoding, str(file_path), "-"]
-       output = subprocess.run(command, stdout=subprocess.PIPE, shell=False)  # type: ignore
+       # if layout:
+       #     command = ["pdftotext", "-enc", encoding, "-layout", str(file_path), "-"]
+       # else:
+       #     command = ["pdftotext", "-enc", encoding, str(file_path), "-"]
+       if not encoding:
+           encoding = self.encoding
+
+       command = f"pdftotext -enc {encoding} {'-layout ' if layout else ''}{str(file_path)} -".split()
+       output = subprocess.run(command, stdout=subprocess.PIPE, shell=False)
        document = output.stdout.decode(errors="ignore")
        pages = document.split("\f")
        pages = pages[:-1]  # the last page in the split is always empty.
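
For clarity, the f-string/`.split()` construction above produces the same argument list as building it by hand, as long as the path contains no spaces (hypothetical values):

```python
from pathlib import Path

encoding, layout, file_path = "UTF-8", True, Path("sample.pdf")
command = f"pdftotext -enc {encoding} {'-layout ' if layout else ''}{str(file_path)} -".split()
assert command == ["pdftotext", "-enc", "UTF-8", "-layout", "sample.pdf", "-"]
```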
@@ -208,7 +215,7 @@ class PDFToTextOCRConverter(BaseConverter):
        meta: Optional[Dict[str, str]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
-       encoding: Optional[str] = "utf-8",
+       encoding: Optional[str] = "UTF-8",
        id_hash_keys: Optional[List[str]] = None,
    ) -> List[Document]:
        """

@@ -229,7 +236,7 @@ class PDFToTextOCRConverter(BaseConverter):
            This option can be used to add a test for encoding errors. If the extracted text is
            not in one of the valid languages, it is likely an encoding error resulting
            in garbled text.
-       :param encoding: Select the file encoding (default is `utf-8`)
+       :param encoding: Select the file encoding (default is `UTF-8`)
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
@@ -244,11 +251,10 @@ class PDFToTextOCRConverter(BaseConverter):
            for image in images:
                temp_img = tempfile.NamedTemporaryFile(dir=os.path.dirname(os.path.realpath(__file__)), suffix=".jpeg")
                image.save(temp_img.name)
-               pages.append(self.image_2_text.convert(temp_img.name)[0].content)
+               pages.append(self.image_2_text.convert(file_path=temp_img.name, encoding=encoding)[0].content)
        except Exception as exception:
            logger.error(f"File {file_path} has an error \n {exception}")

        raw_text = "\f".join(pages)
        document = Document(content=raw_text, meta=meta, id_hash_keys=id_hash_keys)

        return [document]
@@ -60,8 +60,6 @@ def convert_files_to_docs(
    documents = []
    for suffix, paths in suffix2paths.items():
        for path in paths:
-           if encoding is None and suffix == ".pdf":
-               encoding = "Latin1"
            logger.info("Converting {}".format(path))
            # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single Document
            document = suffix2converter[suffix].convert(
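
With the Latin1 special case removed, PDFs now follow the converter's own UTF-8 default unless the caller passes an encoding explicitly. A hedged usage sketch (directory path assumed):

```python
from haystack.utils import convert_files_to_docs

# PDFs in the directory are decoded with the converter's default ("UTF-8")
# unless `encoding` is passed explicitly.
docs = convert_files_to_docs(dir_path="data/my_files")
```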
@@ -35,6 +35,38 @@ def test_convert(Converter):
    assert "Adobe Systems made the PDF specification available free of charge in 1993." in page_standard_whitespace


@pytest.mark.parametrize("Converter", [PDFToTextConverter])  # TODO PDFToTextOCRConverter should pass this test too
def test_pdf_encoding(Converter):
    converter = Converter()

    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
    assert "ɪ" in document.content

    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf", encoding="Latin1")[0]["documents"][0]
    assert "ɪ" not in document.content


@pytest.mark.parametrize("Converter", [PDFToTextConverter])  # TODO PDFToTextOCRConverter should pass this test too
def test_pdf_ligatures(Converter):
    converter = Converter()

    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
    assert "ﬀ" not in document.content
    assert "ɪ" in document.content

    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf", known_ligatures={})[0]["documents"][0]
    assert "ﬀ" in document.content
    assert "ɪ" in document.content

    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf", known_ligatures={"ɪ": "i"})[0]["documents"][0]
    assert "ﬀ" in document.content
    assert "ɪ" not in document.content


@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_table_removal(Converter):
@@ -150,8 +150,7 @@
    "Haystack's converter classes are designed to help you turn files on your computer into the documents\n",
    "that can be processed by the Haystack pipeline.\n",
    "There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika.\n",
-   "The parameter `valid_languages` does not convert files to the target language, but checks if the conversion worked as expected.\n",
-   "For converting PDFs, try changing the encoding to UTF-8 if the conversion isn't great."
+   "The parameter `valid_languages` does not convert files to the target language, but checks if the conversion worked as expected."
   ]
  },
  {
@@ -38,7 +38,6 @@ def tutorial8_preprocessing():
    that can be processed by the Haystack pipeline.
    There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika.
    The parameter `valid_languages` does not convert files to the target language, but checks if the conversion worked as expected.
-   For converting PDFs, try changing the encoding to UTF-8 if the conversion isn't great.
    """

    # Here are some examples of how you would use file converters