Add support for Hebrew Language and Alphabet (#13797)

* Add Hebrew language support for training
  https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet
* Add Hebrew language dictionary
  https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet
* Add Samaritan Script dictionary
  Samaritan script is written right-to-left, like Arabic and Hebrew. It is used for Samaritan Hebrew and Samaritan Aramaic, and some texts also contain Arabic letters.
  https://en.wikipedia.org/wiki/Samaritan_(Unicode_block)
  https://en.wikipedia.org/wiki/Samaritan_Hebrew
  https://en.wikipedia.org/wiki/Samaritan_Aramaic_language
* Add Samaritan Script training
  Samaritan script is written right-to-left, like Arabic and Hebrew. It is used for Samaritan Hebrew and Samaritan Aramaic, and some texts also contain Arabic letters.
  https://en.wikipedia.org/wiki/Samaritan_(Unicode_block)
  https://en.wikipedia.org/wiki/Samaritan_Hebrew
  https://en.wikipedia.org/wiki/Samaritan_Aramaic_language
* Update hebrew_dict.txt
2025-06-26 21:24:27 +00:00 · 2024-08-31 18:18:37 -07:00 · 2024-08-31 18:18:37 -07:00 · 6225a90ef0
commit 6225a90ef0
parent 77f4c01f69
4 changed files with 656 additions and 0 deletions
--- a/configs/rec/multi_language/rec_hebrew_lite_train.yml
+++ b/configs/rec/multi_language/rec_hebrew_lite_train.yml
@ -0,0 +1,110 @@
+Global:  # run-wide settings for the Hebrew lightweight recognition model
+  use_gpu: true  # train on GPU
+  epoch_num: 500  # maximum number of training epochs
+  log_smooth_window: 20  # presumably the length of the log-smoothing queue, in iterations; confirm against the PaddleOCR config docs
+  print_batch_step: 10  # logging interval, in iterations
+  save_model_dir: ./output/rec_hebrew_lite  # directory the trained model is saved to
+  save_epoch_step: 3  # model save interval, in epochs
+  eval_batch_step:  # [start_iter, interval]: evaluate every 2000 iterations, starting from iteration 0
+  - 0
+  - 2000
+  cal_metric_during_train: true  # also compute the metric on training batches
+  pretrained_model: null  # no pretrained weights configured
+  checkpoints: null  # no checkpoint to resume from
+  save_inference_dir: null  # no inference-model export directory configured
+  use_visualdl: false  # VisualDL logging disabled
+  infer_img: null  # no inference image configured
+  character_dict_path: ppocr/utils/dict/hebrew_dict.txt  # character dictionary added by this same commit
+  max_text_length: 25  # maximum label length; each dictionary entry (including the combining Hebrew points/accents) presumably counts as one character, so pointed text reaches the limit sooner; TODO confirm
+  infer_mode: false
+  use_space_char: true  # treat the space character as a recognisable class
+Optimizer:  # optimiser, learning-rate schedule and weight regularisation
+  name: Adam
+  beta1: 0.9  # Adam first-moment decay rate
+  beta2: 0.999  # Adam second-moment decay rate
+  lr:
+    name: Cosine  # cosine learning-rate schedule
+    learning_rate: 0.001  # initial learning rate
+  regularizer:
+    name: L2  # L2 weight regularisation
+    factor: 1.0e-05  # regularisation coefficient
+Architecture:  # network definition: Transform (unused) -> Backbone -> Neck -> Head
+  model_type: rec  # text-recognition model
+  algorithm: CRNN
+  Transform: null  # no transform stage configured
+  Backbone:
+    name: MobileNetV3
+    scale: 0.5  # presumably the backbone width multiplier; confirm against the backbone implementation
+    model_name: small  # MobileNetV3 "small" variant
+    small_stride:  # four stride values for the "small" variant; presumably one per downsampling stage, TODO confirm
+    - 1
+    - 2
+    - 2
+    - 2
+  Neck:
+    name: SequenceEncoder
+    encoder_type: rnn  # recurrent sequence encoder
+    hidden_size: 48  # encoder hidden size
+  Head:
+    name: CTCHead
+    fc_decay: 1.0e-05  # presumably the regularisation coefficient of the head's fully connected layer; confirm
+Loss:
+  name: CTCLoss  # CTC loss, paired with the CTCHead and CTCLabelEncode used elsewhere in this config
+PostProcess:
+  name: CTCLabelDecode  # NOTE(review): Hebrew is right-to-left; PaddleOCR's decoder appears to reverse output only when the dict path contains "arabic", so confirm decoded text order for hebrew_dict.txt
+Metric:
+  name: RecMetric
+  main_indicator: acc  # accuracy is the indicator used to pick the best model
+Train:  # training data pipeline and loader
+  dataset:
+    name: SimpleDataSet
+    data_dir: train_data/  # dataset root; image paths in the label file are presumably resolved against it, confirm
+    label_file_list:
+    - train_data/hebrew_train.txt  # training label file; not among the four files added by this commit, so it must be supplied separately
+    transforms:  # applied in the order listed
+    - DecodeImage:
+        img_mode: BGR
+        channel_first: false  # keep the decoded image channels-last
+    - RecAug: null  # recognition augmentation with no arguments given; training only (not present under Eval)
+    - CTCLabelEncode: null  # encode the label text for CTC; no extra arguments given here
+    - RecResizeImg:
+        image_shape:  # presumably [channels, height, width]; TODO confirm
+        - 3
+        - 32
+        - 320
+    - KeepKeys:
+        keep_keys:  # sample fields passed on to the loader
+        - image
+        - label
+        - length
+  loader:
+    shuffle: true  # shuffle training samples
+    batch_size_per_card: 256  # batch size per GPU card
+    drop_last: true  # discard the final incomplete batch
+    num_workers: 8  # data-loading worker processes
+Eval:  # evaluation data pipeline and loader; same transforms as Train minus RecAug
+  dataset:
+    name: SimpleDataSet
+    data_dir: train_data/  # same dataset root as Train
+    label_file_list:
+    - train_data/hebrew_val.txt  # validation label file; not among the four files added by this commit, so it must be supplied separately
+    transforms:  # applied in the order listed
+    - DecodeImage:
+        img_mode: BGR
+        channel_first: false  # keep the decoded image channels-last
+    - CTCLabelEncode: null  # encode the label text for CTC; no extra arguments given here
+    - RecResizeImg:
+        image_shape:  # same values as Train; presumably [channels, height, width], TODO confirm
+        - 3
+        - 32
+        - 320
+    - KeepKeys:
+        keep_keys:  # sample fields passed on to the loader
+        - image
+        - label
+        - length
+  loader:
+    shuffle: false  # keep evaluation order fixed
+    drop_last: false  # evaluate every sample, including the final partial batch
+    batch_size_per_card: 256  # batch size per GPU card
+    num_workers: 8  # data-loading worker processes
--- a/configs/rec/multi_language/rec_samaritan_lite_train.yml
+++ b/configs/rec/multi_language/rec_samaritan_lite_train.yml
@ -0,0 +1,110 @@
+Global:  # run-wide settings for the Samaritan-script lightweight recognition model
+  use_gpu: true  # train on GPU
+  epoch_num: 500  # maximum number of training epochs
+  log_smooth_window: 20  # presumably the length of the log-smoothing queue, in iterations; confirm against the PaddleOCR config docs
+  print_batch_step: 10  # logging interval, in iterations
+  save_model_dir: ./output/rec_samaritan_lite  # directory the trained model is saved to
+  save_epoch_step: 3  # model save interval, in epochs
+  eval_batch_step:  # [start_iter, interval]: evaluate every 2000 iterations, starting from iteration 0
+  - 0
+  - 2000
+  cal_metric_during_train: true  # also compute the metric on training batches
+  pretrained_model: null  # no pretrained weights configured
+  checkpoints: null  # no checkpoint to resume from
+  save_inference_dir: null  # no inference-model export directory configured
+  use_visualdl: false  # VisualDL logging disabled
+  infer_img: null  # no inference image configured
+  character_dict_path: ppocr/utils/dict/samaritan_dict.txt  # character dictionary added by this same commit (Latin, Arabic and Samaritan entries)
+  max_text_length: 25  # maximum label length; each dictionary entry (including combining marks) presumably counts as one character; TODO confirm
+  infer_mode: false
+  use_space_char: true  # treat the space character as a recognisable class
+Optimizer:  # optimiser, learning-rate schedule and weight regularisation
+  name: Adam
+  beta1: 0.9  # Adam first-moment decay rate
+  beta2: 0.999  # Adam second-moment decay rate
+  lr:
+    name: Cosine  # cosine learning-rate schedule
+    learning_rate: 0.001  # initial learning rate
+  regularizer:
+    name: L2  # L2 weight regularisation
+    factor: 1.0e-05  # regularisation coefficient
+Architecture:  # network definition: Transform (unused) -> Backbone -> Neck -> Head
+  model_type: rec  # text-recognition model
+  algorithm: CRNN
+  Transform: null  # no transform stage configured
+  Backbone:
+    name: MobileNetV3
+    scale: 0.5  # presumably the backbone width multiplier; confirm against the backbone implementation
+    model_name: small  # MobileNetV3 "small" variant
+    small_stride:  # four stride values for the "small" variant; presumably one per downsampling stage, TODO confirm
+    - 1
+    - 2
+    - 2
+    - 2
+  Neck:
+    name: SequenceEncoder
+    encoder_type: rnn  # recurrent sequence encoder
+    hidden_size: 48  # encoder hidden size
+  Head:
+    name: CTCHead
+    fc_decay: 1.0e-05  # presumably the regularisation coefficient of the head's fully connected layer; confirm
+Loss:
+  name: CTCLoss  # CTC loss, paired with the CTCHead and CTCLabelEncode used elsewhere in this config
+PostProcess:
+  name: CTCLabelDecode  # NOTE(review): Samaritan script is right-to-left; PaddleOCR's decoder appears to reverse output only when the dict path contains "arabic", so confirm decoded text order for samaritan_dict.txt
+Metric:
+  name: RecMetric
+  main_indicator: acc  # accuracy is the indicator used to pick the best model
+Train:  # training data pipeline and loader
+  dataset:
+    name: SimpleDataSet
+    data_dir: train_data/  # dataset root; image paths in the label file are presumably resolved against it, confirm
+    label_file_list:
+    - train_data/samaritan_train.txt  # training label file; not among the four files added by this commit, so it must be supplied separately
+    transforms:  # applied in the order listed
+    - DecodeImage:
+        img_mode: BGR
+        channel_first: false  # keep the decoded image channels-last
+    - RecAug: null  # recognition augmentation with no arguments given; training only (not present under Eval)
+    - CTCLabelEncode: null  # encode the label text for CTC; no extra arguments given here
+    - RecResizeImg:
+        image_shape:  # presumably [channels, height, width]; TODO confirm
+        - 3
+        - 32
+        - 320
+    - KeepKeys:
+        keep_keys:  # sample fields passed on to the loader
+        - image
+        - label
+        - length
+  loader:
+    shuffle: true  # shuffle training samples
+    batch_size_per_card: 256  # batch size per GPU card
+    drop_last: true  # discard the final incomplete batch
+    num_workers: 8  # data-loading worker processes
+Eval:  # evaluation data pipeline and loader; same transforms as Train minus RecAug
+  dataset:
+    name: SimpleDataSet
+    data_dir: train_data/  # same dataset root as Train
+    label_file_list:
+    - train_data/samaritan_val.txt  # validation label file; not among the four files added by this commit, so it must be supplied separately
+    transforms:  # applied in the order listed
+    - DecodeImage:
+        img_mode: BGR
+        channel_first: false  # keep the decoded image channels-last
+    - CTCLabelEncode: null  # encode the label text for CTC; no extra arguments given here
+    - RecResizeImg:
+        image_shape:  # same values as Train; presumably [channels, height, width], TODO confirm
+        - 3
+        - 32
+        - 320
+    - KeepKeys:
+        keep_keys:  # sample fields passed on to the loader
+        - image
+        - label
+        - length
+  loader:
+    shuffle: false  # keep evaluation order fixed
+    drop_last: false  # evaluate every sample, including the final partial batch
+    batch_size_per_card: 256  # batch size per GPU card
+    num_workers: 8  # data-loading worker processes
--- a/ppocr/utils/dict/hebrew_dict.txt
+++ b/ppocr/utils/dict/hebrew_dict.txt
@ -0,0 +1,214 @@
+!
+#
+$
+%
+&
+'
+(
++
+,
+-
+.
+/
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+:
+?
+@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+_
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+É
+é
+֑
+֒
+֓
+֔
+֕
+֖
+֗
+֘
+֙
+֚
+֛
+֜
+֝
+֞
+֟
+֠
+֡
+֢
+֣
+֤
+֥
+֦
+֧
+֨
+֩
+֪
+֫
+֬
+֭
+֮
+֯
+ְ
+ֱ
+ֲ
+ֳ
+ִ
+ֵ
+ֶ
+ַ
+ָ
+ֹ
+ֺ
+ֻ
+ּ
+ֽ
+־
+ֿ
+׀
+ׁ
+ׂ
+׃
+ׄ
+ׅ
+׆
+ׇ
+א
+ב
+ג
+ד
+ה
+ו
+ז
+ח
+ט
+י
+ך
+כ
+ל
+ם
+מ
+ן
+נ
+ס
+ע
+ף
+פ
+ץ
+צ
+ק
+ר
+ש
+ת
+ׯ
+װ
+ױ
+ײ
+׳
+״
+יִ
+ﬞ
+ײַ
+ﬠ
+ﬡ
+ﬢ
+ﬣ
+ﬤ
+ﬥ
+ﬦ
+ﬧ
+ﬨ
+﬩
+שׁ
+שׂ
+שּׁ
+שּׂ
+אַ
+אָ
+אּ
+בּ
+גּ
+דּ
+הּ
+וּ
+זּ
+טּ
+יּ
+ךּ
+כּ
+לּ
+מּ
+נּ
+סּ
+ףּ
+פּ
+צּ
+קּ
+רּ
+שּ
+תּ
+וֹ
+בֿ
+כֿ
+פֿ
+ﭏ
--- a/ppocr/utils/dict/samaritan_dict.txt
+++ b/ppocr/utils/dict/samaritan_dict.txt
@ -0,0 +1,222 @@
+!
+#
+$
+%
+&
+'
+(
++
+,
+-
+.
+/
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+:
+?
+@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+_
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+É
+é
+ء
+آ
+أ
+ؤ
+إ
+ئ
+ا
+ب
+ة
+ت
+ث
+ج
+ح
+خ
+د
+ذ
+ر
+ز
+س
+ش
+ص
+ض
+ط
+ظ
+ع
+غ
+ف
+ق
+ك
+ل
+م
+ن
+ه
+و
+ى
+ي
+ً
+ٌ
+ٍ
+َ
+ُ
+ِ
+ّ
+ْ
+ٓ
+ٔ
+ٰ
+ٱ
+ٹ
+پ
+چ
+ڈ
+ڑ
+ژ
+ک
+ڭ
+گ
+ں
+ھ
+ۀ
+ہ
+ۂ
+ۃ
+ۆ
+ۇ
+ۈ
+ۋ
+ی
+ې
+ے
+ۓ
+ە
+١
+٢
+٣
+٤
+٥
+٦
+٧
+٨
+٩
+ࠀ
+ࠁ
+ࠂ
+ࠃ
+ࠄ
+ࠅ
+ࠆ
+ࠇ
+ࠈ
+ࠉ
+ࠊ
+ࠋ
+ࠌ
+ࠍ
+ࠎ
+ࠏ
+ࠐ
+ࠑ
+ࠒ
+ࠓ
+ࠔ
+ࠕ
+ࠖ
+ࠗ
+࠘
+࠙
+ࠚ
+ࠛ
+ࠜ
+ࠝ
+ࠞ
+ࠟ
+ࠠ
+ࠡ
+ࠢ
+ࠣ
+ࠤ
+ࠥ
+ࠦ
+ࠧ
+ࠨ
+ࠩ
+ࠪ
+ࠫ
+ࠬ
+࠭
+࠰
+࠱
+࠲
+࠳
+࠴
+࠵
+࠶
+࠷
+࠸
+࠹
+࠺
+࠻
+࠼
+࠽
+࠾