mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-04 19:16:03 +00:00
chore: function to map between standard and Tesseract language codes (#1421)
### Summary In order to convert between incompatible language codes from packages used for OCR, this change adds a function to map between any standard language codes and Tesseract OCR specific codes. Users can input language information to `languages` in any Tesseract-supported langcode or any ISO 639 standard language code. ### Details - Introduces the [python-iso639](https://pypi.org/project/python-iso639/) package for matching standard language codes. Recompiles all dependencies. - If a language is not already supplied by the user as a Tesseract specific langcode, supplies all possible script/orthography variants of the language to the Tesseract OCR agent. ### Test Added many unit tests for a variety of language combinations, special cases, and variants. For general testing, call partition functions with any lang codes in the languages parameter (Tesseract or standard). For example, ``` from unstructured.partition.auto import partition elements = partition(filename="example-docs/layout-parser-paper.pdf", strategy="hi_res", languages=["en", "chi"]) print("\n\n".join([str(el) for el in elements])) ``` should supply eng+chi_sim+chi_sim_vert+chi_tra+chi_tra_vert to Tesseract.
This commit is contained in:
parent
3a07d1e6b4
commit
eb8ce89137
@ -2,6 +2,8 @@
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Add a function to map between Tesseract and standard language codes.** This allows users to input language information to the `languages` param in any Tesseract-supported langcode or any ISO 639 standard language code.
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
BIN
example-docs/chi_sim_image.jpeg
Normal file
BIN
example-docs/chi_sim_image.jpeg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 20 KiB |
BIN
example-docs/jpn-vert.jpeg
Normal file
BIN
example-docs/jpn-vert.jpeg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 34 KiB |
@ -9,3 +9,4 @@ requests
|
||||
beautifulsoup4
|
||||
emoji
|
||||
dataclasses-json
|
||||
python-iso639
|
||||
@ -36,6 +36,8 @@ nltk==3.8.1
|
||||
# via -r requirements/base.in
|
||||
packaging==23.1
|
||||
# via marshmallow
|
||||
python-iso639==2023.6.15
|
||||
# via -r requirements/base.in
|
||||
python-magic==0.4.27
|
||||
# via -r requirements/base.in
|
||||
regex==2023.8.8
|
||||
|
||||
@ -207,7 +207,7 @@ nbformat==5.9.2
|
||||
# jupyter-server
|
||||
# nbclient
|
||||
# nbconvert
|
||||
nest-asyncio==1.5.7
|
||||
nest-asyncio==1.5.8
|
||||
# via ipykernel
|
||||
nodeenv==1.8.0
|
||||
# via pre-commit
|
||||
|
||||
@ -27,7 +27,7 @@ click==8.1.7
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# flask
|
||||
contourpy==1.1.0
|
||||
contourpy==1.1.1
|
||||
# via matplotlib
|
||||
cssselect==1.2.0
|
||||
# via premailer
|
||||
@ -148,7 +148,7 @@ psutil==5.9.5
|
||||
# via visualdl
|
||||
pyclipper==1.3.0.post5
|
||||
# via unstructured-paddleocr
|
||||
pycryptodome==3.18.0
|
||||
pycryptodome==3.19.0
|
||||
# via bce-python-sdk
|
||||
pyparsing==3.0.9
|
||||
# via
|
||||
|
||||
@ -20,7 +20,7 @@ charset-normalizer==3.2.0
|
||||
# requests
|
||||
coloredlogs==15.0.1
|
||||
# via onnxruntime
|
||||
contourpy==1.1.0
|
||||
contourpy==1.1.1
|
||||
# via matplotlib
|
||||
cryptography==41.0.3
|
||||
# via pdfminer-six
|
||||
@ -124,7 +124,7 @@ pillow==10.0.1
|
||||
# pytesseract
|
||||
# torchvision
|
||||
# unstructured-pytesseract
|
||||
portalocker==2.7.0
|
||||
portalocker==2.8.2
|
||||
# via iopath
|
||||
protobuf==4.23.4
|
||||
# via
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
#
|
||||
# pip-compile requirements/ingest-azure.in
|
||||
#
|
||||
adlfs==2023.8.0
|
||||
adlfs==2023.9.0
|
||||
# via -r requirements/ingest-azure.in
|
||||
aiohttp==3.8.5
|
||||
# via adlfs
|
||||
@ -71,7 +71,7 @@ multidict==6.0.4
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
portalocker==2.7.0
|
||||
portalocker==2.8.2
|
||||
# via msal-extensions
|
||||
pycparser==2.21
|
||||
# via cffi
|
||||
|
||||
@ -430,6 +430,20 @@ def test_partition_image_with_ocr_coordinates_are_not_nan_from_filename(
|
||||
assert point[1] is not math.nan
|
||||
|
||||
|
||||
def test_partition_image_formats_languages_for_tesseract():
    """A Tesseract-native langcode ("jpn_vert") is forwarded unchanged as ocr_languages."""
    filename = "example-docs/jpn-vert.jpeg"
    fake_process = mock.MagicMock()
    with mock.patch.object(layout, "process_file_with_model", fake_process) as patched:
        image.partition_image(filename=filename, strategy="hi_res", languages=["jpn_vert"])
        expected_kwargs = {
            "is_image": True,
            "ocr_languages": "jpn_vert",
            "ocr_mode": "entire_page",
            "extract_tables": False,
            "model_name": None,
        }
        patched.assert_called_once_with(filename, **expected_kwargs)
|
||||
|
||||
|
||||
def test_partition_image_warns_with_ocr_languages(caplog):
|
||||
filename = "example-docs/layout-parser-paper-fast.jpg"
|
||||
image.partition_image(filename=filename, strategy="hi_res", ocr_languages="eng")
|
||||
|
||||
@ -840,6 +840,20 @@ def test_add_chunking_strategy_on_partition_pdf(
|
||||
assert chunk_elements == chunks
|
||||
|
||||
|
||||
def test_partition_pdf_formats_languages_for_tesseract():
    """An ISO 639-1 code ("en") is converted to Tesseract's "eng" before the model call."""
    filename = "example-docs/DA-1p.pdf"
    fake_process = mock.MagicMock()
    with mock.patch.object(layout, "process_file_with_model", fake_process) as patched:
        pdf.partition_pdf(filename=filename, strategy="hi_res", languages=["en"])
        expected_kwargs = {
            "is_image": False,
            "ocr_languages": "eng",
            "ocr_mode": "entire_page",
            "extract_tables": False,
            "model_name": None,
        }
        patched.assert_called_once_with(filename, **expected_kwargs)
|
||||
|
||||
|
||||
def test_partition_pdf_warns_with_ocr_languages(caplog):
|
||||
filename = "example-docs/chevron-page.pdf"
|
||||
pdf.partition_pdf(filename=filename, strategy="hi_res", ocr_languages="eng")
|
||||
|
||||
@ -369,6 +369,22 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, requ
|
||||
assert elements[1].text.startswith("Zejiang Shen")
|
||||
|
||||
|
||||
def test_auto_partition_formats_languages_for_tesseract():
    """A bare ISO 639-1 code ("zh") expands to every Chinese Tesseract variant."""
    filename = "example-docs/chi_sim_image.jpeg"
    target = "unstructured_inference.inference.layout.process_file_with_model"
    with patch(target) as patched_process:
        partition(filename, strategy="hi_res", languages=["zh"])
        expected_kwargs = {
            "is_image": True,
            "ocr_languages": "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert",
            "ocr_mode": "entire_page",
            "extract_tables": False,
            "model_name": None,
        }
        patched_process.assert_called_once_with(filename, **expected_kwargs)
|
||||
|
||||
|
||||
def test_auto_partition_warns_with_ocr_languages(caplog):
|
||||
filename = "example-docs/chevron-page.pdf"
|
||||
partition(filename=filename, strategy="hi_res", ocr_languages="eng")
|
||||
|
||||
47
test_unstructured/partition/test_lang.py
Normal file
47
test_unstructured/partition/test_lang.py
Normal file
@ -0,0 +1,47 @@
|
||||
from unstructured.partition import lang
|
||||
|
||||
|
||||
def test_prepare_languages_for_tesseract_with_one_language():
    # A single ISO 639-1 code maps to its 3-letter Tesseract equivalent.
    assert lang.prepare_languages_for_tesseract(["en"]) == "eng"
|
||||
|
||||
|
||||
def test_prepare_languages_for_tesseract_special_case():
    # Tesseract's special non-language packs (orientation/script detection and
    # math/equations) must pass through untouched.
    for special_code in ("osd", "equ"):
        assert lang.prepare_languages_for_tesseract([special_code]) == special_code
|
||||
|
||||
|
||||
def test_prepare_languages_for_tesseract_removes_empty_inputs():
    # "kbd" has no Tesseract pack and converts to "", which must be dropped
    # rather than producing a dangling "+" in the joined result.
    result = lang.prepare_languages_for_tesseract(["kbd", "es"])
    assert result == "spa+spa_old"
|
||||
|
||||
|
||||
def test_prepare_languages_for_tesseract_includes_variants():
    # "chi" expands to every Chinese script/orientation variant Tesseract ships.
    expected = "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
    assert lang.prepare_languages_for_tesseract(["chi"]) == expected
|
||||
|
||||
|
||||
def test_prepare_languages_for_tesseract_with_multiple_languages():
    # Mixed standard and Tesseract codes are converted individually and
    # joined with "+" in the caller's order.
    result = lang.prepare_languages_for_tesseract(["ja", "afr", "en", "equ"])
    assert result == "jpn+jpn_vert+afr+eng+equ"
|
||||
|
||||
|
||||
def test_prepare_languages_for_tesseract_warns_nonstandard_language(caplog):
    # An unknown code ("zzz") is dropped with a warning while the valid
    # code in the same list still converts normally.
    result = lang.prepare_languages_for_tesseract(["zzz", "chi"])
    assert result == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
    assert "not a valid standard language code" in caplog.text
|
||||
|
||||
|
||||
def test_prepare_languages_for_tesseract_warns_non_tesseract_language(caplog):
    # A valid ISO code with no Tesseract pack ("kbd") is dropped with a warning.
    result = lang.prepare_languages_for_tesseract(["kbd", "eng"])
    assert result == "eng"
    assert "not a language supported by Tesseract" in caplog.text
|
||||
@ -1,13 +1,149 @@
|
||||
from typing import List, Optional

import iso639

from unstructured.logger import logger
|
||||
|
||||
# pytesseract.get_languages(config="") only shows user installed language packs,
# so manually include the list of all currently supported Tesseract languages
# NOTE: entries are Tesseract traineddata names — a 3-letter base code
# (typically ISO 639-2/639-3) optionally followed by a script/orthography
# suffix such as "_sim", "_tra", "_vert", "_old", "_cyrl", "_latn".
# "equ" (math/equation detection) and "osd" (orientation & script detection)
# are special non-language packs. List order determines the order in which
# variants are joined when a standard code expands to several packs.
PYTESSERACT_LANGS = [
    "afr",
    "amh",
    "ara",
    "asm",
    "aze",
    "aze_cyrl",
    "bel",
    "ben",
    "bod",
    "bos",
    "bre",
    "bul",
    "cat",
    "ceb",
    "ces",
    "chi_sim",
    "chi_sim_vert",
    "chi_tra",
    "chi_tra_vert",
    "chr",
    "cos",
    "cym",
    "dan",
    "deu",
    "div",
    "dzo",
    "ell",
    "eng",
    "enm",
    "epo",
    "equ",
    "est",
    "eus",
    "fao",
    "fas",
    "fil",
    "fin",
    "fra",
    "frk",
    "frm",
    "fry",
    "gla",
    "gle",
    "glg",
    "grc",
    "guj",
    "hat",
    "heb",
    "hin",
    "hrv",
    "hun",
    "hye",
    "iku",
    "ind",
    "isl",
    "ita",
    "ita_old",
    "jav",
    "jpn",
    "jpn_vert",
    "kan",
    "kat",
    "kat_old",
    "kaz",
    "khm",
    "kir",
    "kmr",
    "kor",
    "kor_vert",
    "lao",
    "lat",
    "lav",
    "lit",
    "ltz",
    "mal",
    "mar",
    "mkd",
    "mlt",
    "mon",
    "mri",
    "msa",
    "mya",
    "nep",
    "nld",
    "nor",
    "oci",
    "ori",
    "osd",
    "pan",
    "pol",
    "por",
    "pus",
    "que",
    "ron",
    "rus",
    "san",
    "sin",
    "slk",
    "slv",
    "snd",
    "snum",  # NOTE(review): not a known Tesseract traineddata name — verify against tessdata
    "spa",
    "spa_old",
    "sqi",
    "srp",
    "srp_latn",
    "sun",
    "swa",
    "swe",
    "syr",
    "tam",
    "tat",
    "tel",
    "tgk",
    "tha",
    "tir",
    "ton",
    "tur",
    "uig",
    "ukr",
    "urd",
    "uzb",
    "uzb_cyrl",
    "vie",
    "yid",
    "yor",
]
|
||||
|
||||
|
||||
def prepare_languages_for_tesseract(languages: Optional[List[str]] = None) -> str:
    """Convert a list of language codes into a Tesseract ``ocr_languages`` string.

    Each entry may be a Tesseract-native langcode or any ISO 639 standard code;
    codes with no Tesseract support convert to "" (with a warning logged by
    ``convert_language_to_tesseract``) and are dropped. The surviving codes are
    joined with "+", e.g. ``["en", "jpn_vert"]`` -> ``"eng+jpn_vert"``.

    Args:
        languages: language codes to convert; defaults to ``["eng"]``.

    Returns:
        The "+"-joined Tesseract langcode string (empty if nothing converts).
    """
    # Apply the default inside the body instead of using a mutable default
    # argument (a list default is shared across calls).
    if languages is None:
        languages = ["eng"]
    converted_languages = [
        converted
        for converted in (convert_language_to_tesseract(language) for language in languages)
        if converted  # drop "" results so the join has no dangling "+"
    ]
    return "+".join(converted_languages)
|
||||
|
||||
|
||||
def convert_old_ocr_languages_to_languages(ocr_languages: str):
|
||||
@ -17,3 +153,51 @@ def convert_old_ocr_languages_to_languages(ocr_languages: str):
|
||||
"""
|
||||
|
||||
return ocr_languages.split("+")
|
||||
|
||||
|
||||
def convert_language_to_tesseract(lang: str) -> str:
    """Convert a language code to its Tesseract formatted and recognized langcode(s).

    Returns the original code if it is already a Tesseract langcode; otherwise
    matches it through the ISO 639 registry and returns all Tesseract variants
    sharing the matched 3-letter prefix, joined with "+". Returns "" (after
    logging a warning) when the code is unknown or unsupported by Tesseract.
    """
    # A code that is already a Tesseract langcode passes through unchanged —
    # this also catches the special non-language packs "equ" and "osd".
    # NOTE(shreya): this may catch some cases of choosing between tesseract code variants for a lang
    if lang in PYTESSERACT_LANGS:
        return lang

    # Look the code up in the ISO 639 registry; unknown codes are dropped.
    try:
        lang_iso639 = iso639.Language.match(lang.lower())
    except iso639.LanguageNotFoundError:
        logger.warning(f"{lang} is not a valid standard language code.")
        return ""

    # Tesseract uses 3-letter codes (ISO 639-3, 639-2b, etc.) as prefixes, with
    # suffixes for orthography/script; compare standard codes against the
    # 3-letter prefixes. (Comprehension variable renamed so it does not shadow
    # the `lang` parameter.)
    tesseract_prefixes = {code[:3] for code in PYTESSERACT_LANGS}

    # Try each ISO 639 representation in order (639-3, then 639-2b, then
    # 639-2t) — same precedence as the original if/elif chain; the first
    # matching prefix wins and all of its variants are returned.
    for candidate in (lang_iso639.part3, lang_iso639.part2b, lang_iso639.part2t):
        if candidate in tesseract_prefixes:
            matched_langcodes = _get_all_tesseract_langcodes_with_prefix(candidate)
            return "+".join(matched_langcodes)

    logger.warning(f"{lang} is not a language supported by Tesseract.")
    return ""
|
||||
|
||||
|
||||
def _get_all_tesseract_langcodes_with_prefix(prefix: str):
    """
    Collect every Tesseract langcode beginning with ``prefix`` — the base code
    plus any script/orthography variants — preserving PYTESSERACT_LANGS order.
    """
    return list(filter(lambda code: code.startswith(prefix), PYTESSERACT_LANGS))
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user