chore: function to map between standard and Tesseract language codes (#1421)

### Summary
To convert between the incompatible language codes used by different
OCR packages, this change adds a function that maps standard language
codes to Tesseract-specific codes. Users can pass language information
to `languages` as any Tesseract-supported langcode or any ISO 639
standard language code.

### Details
- Introduces the
[python-iso639](https://pypi.org/project/python-iso639/) package for
matching standard language codes. Recompiles all dependencies.
- If the user does not supply a language as a Tesseract-specific
langcode, all script/orthography variants of that language are supplied
to the Tesseract OCR agent.

### Test
Added many unit tests for a variety of language combinations, special
cases, and variants. For general testing, call partition functions with
any lang codes in the languages parameter (Tesseract or standard).

For example,
```
from unstructured.partition.auto import partition

elements = partition(filename="example-docs/layout-parser-paper.pdf", strategy="hi_res", languages=["en", "chi"])
print("\n\n".join([str(el) for el in elements]))
```
This should supply `eng+chi_sim+chi_sim_vert+chi_tra+chi_tra_vert` to Tesseract.
This commit is contained in:
shreyanid 2023-09-18 11:42:02 -04:00 committed by GitHub
parent 3a07d1e6b4
commit eb8ce89137
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 292 additions and 12 deletions

View File

@ -2,6 +2,8 @@
### Enhancements
* **Add a function to map between Tesseract and standard language codes.** This allows users to input language information to the `languages` param in any Tesseract-supported langcode or any ISO 639 standard language code.
### Features
### Fixes

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

BIN
example-docs/jpn-vert.jpeg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

View File

@ -9,3 +9,4 @@ requests
beautifulsoup4
emoji
dataclasses-json
python-iso639

View File

@ -36,6 +36,8 @@ nltk==3.8.1
# via -r requirements/base.in
packaging==23.1
# via marshmallow
python-iso639==2023.6.15
# via -r requirements/base.in
python-magic==0.4.27
# via -r requirements/base.in
regex==2023.8.8

View File

@ -207,7 +207,7 @@ nbformat==5.9.2
# jupyter-server
# nbclient
# nbconvert
nest-asyncio==1.5.7
nest-asyncio==1.5.8
# via ipykernel
nodeenv==1.8.0
# via pre-commit

View File

@ -27,7 +27,7 @@ click==8.1.7
# via
# -c requirements/base.txt
# flask
contourpy==1.1.0
contourpy==1.1.1
# via matplotlib
cssselect==1.2.0
# via premailer
@ -148,7 +148,7 @@ psutil==5.9.5
# via visualdl
pyclipper==1.3.0.post5
# via unstructured-paddleocr
pycryptodome==3.18.0
pycryptodome==3.19.0
# via bce-python-sdk
pyparsing==3.0.9
# via

View File

@ -20,7 +20,7 @@ charset-normalizer==3.2.0
# requests
coloredlogs==15.0.1
# via onnxruntime
contourpy==1.1.0
contourpy==1.1.1
# via matplotlib
cryptography==41.0.3
# via pdfminer-six
@ -124,7 +124,7 @@ pillow==10.0.1
# pytesseract
# torchvision
# unstructured-pytesseract
portalocker==2.7.0
portalocker==2.8.2
# via iopath
protobuf==4.23.4
# via

View File

@ -4,7 +4,7 @@
#
# pip-compile requirements/ingest-azure.in
#
adlfs==2023.8.0
adlfs==2023.9.0
# via -r requirements/ingest-azure.in
aiohttp==3.8.5
# via adlfs
@ -71,7 +71,7 @@ multidict==6.0.4
# via
# aiohttp
# yarl
portalocker==2.7.0
portalocker==2.8.2
# via msal-extensions
pycparser==2.21
# via cffi

View File

@ -430,6 +430,20 @@ def test_partition_image_with_ocr_coordinates_are_not_nan_from_filename(
assert point[1] is not math.nan
def test_partition_image_formats_languages_for_tesseract():
    """A Tesseract langcode in `languages` is forwarded to the layout model as `ocr_languages`."""
    image_path = "example-docs/jpn-vert.jpeg"
    process_mock = mock.MagicMock()
    with mock.patch.object(layout, "process_file_with_model", process_mock):
        image.partition_image(
            filename=image_path,
            strategy="hi_res",
            languages=["jpn_vert"],
        )
        # "jpn_vert" is already a Tesseract code, so it must arrive untouched
        process_mock.assert_called_once_with(
            image_path,
            is_image=True,
            ocr_languages="jpn_vert",
            ocr_mode="entire_page",
            extract_tables=False,
            model_name=None,
        )
def test_partition_image_warns_with_ocr_languages(caplog):
filename = "example-docs/layout-parser-paper-fast.jpg"
image.partition_image(filename=filename, strategy="hi_res", ocr_languages="eng")

View File

@ -840,6 +840,20 @@ def test_add_chunking_strategy_on_partition_pdf(
assert chunk_elements == chunks
def test_partition_pdf_formats_languages_for_tesseract():
    """The standard code "en" reaches the layout model as the Tesseract code "eng"."""
    pdf_path = "example-docs/DA-1p.pdf"
    process_mock = mock.MagicMock()
    with mock.patch.object(layout, "process_file_with_model", process_mock):
        pdf.partition_pdf(
            filename=pdf_path,
            strategy="hi_res",
            languages=["en"],
        )
        process_mock.assert_called_once_with(
            pdf_path,
            is_image=False,
            ocr_languages="eng",
            ocr_mode="entire_page",
            extract_tables=False,
            model_name=None,
        )
def test_partition_pdf_warns_with_ocr_languages(caplog):
filename = "example-docs/chevron-page.pdf"
pdf.partition_pdf(filename=filename, strategy="hi_res", ocr_languages="eng")

View File

@ -369,6 +369,22 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, requ
assert elements[1].text.startswith("Zejiang Shen")
def test_auto_partition_formats_languages_for_tesseract():
    """`partition` turns the standard code "zh" into every Chinese Tesseract variant."""
    image_path = "example-docs/chi_sim_image.jpeg"
    patch_target = "unstructured_inference.inference.layout.process_file_with_model"
    with patch(patch_target) as process_mock:
        partition(image_path, strategy="hi_res", languages=["zh"])
        process_mock.assert_called_once_with(
            image_path,
            is_image=True,
            ocr_languages="chi_sim+chi_sim_vert+chi_tra+chi_tra_vert",
            ocr_mode="entire_page",
            extract_tables=False,
            model_name=None,
        )
def test_auto_partition_warns_with_ocr_languages(caplog):
filename = "example-docs/chevron-page.pdf"
partition(filename=filename, strategy="hi_res", ocr_languages="eng")

View File

@ -0,0 +1,47 @@
from unstructured.partition import lang
def test_prepare_languages_for_tesseract_with_one_language():
    """A single standard code is converted to its Tesseract langcode."""
    converted = lang.prepare_languages_for_tesseract(["en"])
    assert converted == "eng"
def test_prepare_languages_for_tesseract_special_case():
    """The Tesseract special codes `osd` and `equ` come back unchanged."""
    for special_code in ("osd", "equ"):
        assert lang.prepare_languages_for_tesseract([special_code]) == special_code
def test_prepare_languages_for_tesseract_removes_empty_inputs():
    """A code that converts to nothing ("kbd") leaves no empty segment in the result."""
    converted = lang.prepare_languages_for_tesseract(["kbd", "es"])
    assert converted == "spa+spa_old"
def test_prepare_languages_for_tesseract_includes_variants():
    """A standard code with several Tesseract variants expands to all of them."""
    expected = "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
    assert lang.prepare_languages_for_tesseract(["chi"]) == expected
def test_prepare_languages_for_tesseract_with_multiple_languages():
    """Several codes are converted in input order and joined with "+"."""
    mixed_codes = ["ja", "afr", "en", "equ"]
    converted = lang.prepare_languages_for_tesseract(mixed_codes)
    assert converted == "jpn+jpn_vert+afr+eng+equ"
def test_prepare_languages_for_tesseract_warns_nonstandard_language(caplog):
    """An unrecognized code ("zzz") is dropped and a warning is logged."""
    converted = lang.prepare_languages_for_tesseract(["zzz", "chi"])
    assert converted == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
    assert "not a valid standard language code" in caplog.text
def test_prepare_languages_for_tesseract_warns_non_tesseract_language(caplog):
    """A code with no Tesseract equivalent ("kbd") is dropped and a warning is logged."""
    converted = lang.prepare_languages_for_tesseract(["kbd", "eng"])
    assert converted == "eng"
    assert "not a language supported by Tesseract" in caplog.text

View File

@ -1,13 +1,149 @@
from typing import List
import iso639
from unstructured.logger import logger
# pytesseract.get_languages(config="") reports only the language packs installed on the
# user's machine, so the full set of currently supported Tesseract langcodes is hard-coded
# here instead. Entries are grouped one initial letter per row.
PYTESSERACT_LANGS = [
    "afr", "amh", "ara", "asm", "aze", "aze_cyrl",
    "bel", "ben", "bod", "bos", "bre", "bul",
    "cat", "ceb", "ces", "chi_sim", "chi_sim_vert", "chi_tra", "chi_tra_vert", "chr", "cos", "cym",
    "dan", "deu", "div", "dzo",
    "ell", "eng", "enm", "epo", "equ", "est", "eus",
    "fao", "fas", "fil", "fin", "fra", "frk", "frm", "fry",
    "gla", "gle", "glg", "grc", "guj",
    "hat", "heb", "hin", "hrv", "hun", "hye",
    "iku", "ind", "isl", "ita", "ita_old",
    "jav", "jpn", "jpn_vert",
    "kan", "kat", "kat_old", "kaz", "khm", "kir", "kmr", "kor", "kor_vert",
    "lao", "lat", "lav", "lit", "ltz",
    "mal", "mar", "mkd", "mlt", "mon", "mri", "msa", "mya",
    "nep", "nld", "nor",
    "oci", "ori", "osd",
    "pan", "pol", "por", "pus",
    "que",
    "ron", "rus",
    "san", "sin", "slk", "slv", "snd", "snum", "spa", "spa_old", "sqi", "srp", "srp_latn",
    "sun", "swa", "swe", "syr",
    "tam", "tat", "tel", "tgk", "tha", "tir", "ton", "tur",
    "uig", "ukr", "urd", "uzb", "uzb_cyrl",
    "vie",
    "yid", "yor",
]
def prepare_languages_for_tesseract(languages: List[str] = ["eng"]):
    """
    Entry point: convert languages (list of strings) into tesseract ocr langcode format (uses +)

    Each entry is passed through `convert_language_to_tesseract`. Entries that convert to an
    empty string are dropped, and a Tesseract code that would appear more than once (for
    example from `["en", "eng"]`) is kept only at its first position.

    Parameters
    ----------
    languages
        Language codes to convert. The default list is only read, never mutated.

    Returns
    -------
    str
        The converted Tesseract langcodes joined with "+"; an empty string if nothing converted.
    """
    # dict keys preserve first-seen order while discarding repeats
    tesseract_codes = {}
    for language in languages:
        converted = convert_language_to_tesseract(language)
        # one converted entry may already hold several "+"-joined variants
        for langcode in filter(None, converted.split("+")):
            tesseract_codes[langcode] = None
    return "+".join(tesseract_codes)
def convert_old_ocr_languages_to_languages(ocr_languages: str):
@ -17,3 +153,51 @@ def convert_old_ocr_languages_to_languages(ocr_languages: str):
"""
return ocr_languages.split("+")
def convert_language_to_tesseract(lang: str) -> str:
    """
    Convert a language code to its tesseract formatted and recognized langcode(s), if supported.

    Parameters
    ----------
    lang
        A Tesseract langcode or a standard language code understood by `iso639.Language.match`.

    Returns
    -------
    str
        `lang` itself when it is already a Tesseract langcode; otherwise every Tesseract
        variant of the matched language joined with "+". An empty string is returned, and a
        warning logged, when `lang` is not a recognized standard code or has no Tesseract
        equivalent.
    """
    # if language is already tesseract langcode, return it immediately
    # this will catch the tesseract special cases equ and osd
    # NOTE(shreya): this may catch some cases of choosing between tesseract code variants for a lang
    if lang in PYTESSERACT_LANGS:
        return lang

    # get iso639 language object
    try:
        lang_iso639 = iso639.Language.match(lang.lower())
    except iso639.LanguageNotFoundError:
        logger.warning(f"{lang} is not a valid standard language code.")
        return ""

    # tesseract uses 3-letter codes (639-3, 639-2b, etc) as the base, with an optional
    # "_suffix" for orthography. Compare against the part before the underscore rather than
    # the first 3 characters, so that "snum" is not mistaken for the ISO 639-3 code "snu".
    tesseract_base_codes = {langcode.split("_")[0] for langcode in PYTESSERACT_LANGS}

    # try ISO 639-3 first, then 639-2b, then 639-2t
    for standard_code in (lang_iso639.part3, lang_iso639.part2b, lang_iso639.part2t):
        if standard_code in tesseract_base_codes:
            matched_langcodes = _get_all_tesseract_langcodes_with_prefix(standard_code)
            return "+".join(matched_langcodes)

    logger.warning(f"{lang} is not a language supported by Tesseract.")
    return ""
def _get_all_tesseract_langcodes_with_prefix(prefix: str):
    """
    Get all matching tesseract langcodes with this prefix (may be one or multiple variants).

    A langcode matches when it equals `prefix` or is `prefix` followed by an
    underscore-separated suffix, e.g. "spa" -> ["spa", "spa_old"]. A bare `startswith(prefix)`
    is not used because it would also return "snum" for the prefix "snu".

    Returns a list of matching langcodes in `PYTESSERACT_LANGS` order; empty if none match.
    """
    variant_marker = f"{prefix}_"
    return [
        langcode
        for langcode in PYTESSERACT_LANGS
        if langcode == prefix or langcode.startswith(variant_marker)
    ]