From 6225a90ef0bf323372f09e06c43a2c3c904f4f85 Mon Sep 17 00:00:00 2001 From: johnlockejrr Date: Sat, 31 Aug 2024 18:18:37 -0700 Subject: [PATCH] Add support for Hebrew Language and Alphabet (#13797) * Add Hebrew language support for training https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet * Add Hebrew language dictionary https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet * Add Samaritan Script dictionary Samaritan Script is RTL like Arabic and Hebrew, used for Samaritan Hebrew and Aramaic, sometimes has Arabic letters in some texts. https://en.wikipedia.org/wiki/Samaritan_(Unicode_block) https://en.wikipedia.org/wiki/Samaritan_Hebrew https://en.wikipedia.org/wiki/Samaritan_Aramaic_language * Add Samaritan Script training Samaritan Script is RTL like Arabic and Hebrew, used for Samaritan Hebrew and Aramaic, sometimes has Arabic letters in some texts. https://en.wikipedia.org/wiki/Samaritan_(Unicode_block) https://en.wikipedia.org/wiki/Samaritan_Hebrew https://en.wikipedia.org/wiki/Samaritan_Aramaic_language * Update hebrew_dict.txt --- .../multi_language/rec_hebrew_lite_train.yml | 110 +++++++++ .../rec_samaritan_lite_train.yml | 110 +++++++++ ppocr/utils/dict/hebrew_dict.txt | 214 +++++++++++++++++ ppocr/utils/dict/samaritan_dict.txt | 222 ++++++++++++++++++ 4 files changed, 656 insertions(+) create mode 100644 configs/rec/multi_language/rec_hebrew_lite_train.yml create mode 100644 configs/rec/multi_language/rec_samaritan_lite_train.yml create mode 100644 ppocr/utils/dict/hebrew_dict.txt create mode 100644 ppocr/utils/dict/samaritan_dict.txt diff --git a/configs/rec/multi_language/rec_hebrew_lite_train.yml b/configs/rec/multi_language/rec_hebrew_lite_train.yml new file mode 100644 index 0000000000..056073eddc --- /dev/null +++ b/configs/rec/multi_language/rec_hebrew_lite_train.yml @@ -0,0 +1,110 @@ +Global: + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_hebrew_lite + save_epoch_step: 3 + eval_batch_step: + - 0 + - 2000 + cal_metric_during_train: true + pretrained_model: null + checkpoints: null + save_inference_dir: null + use_visualdl: false + infer_img: null + character_dict_path: ppocr/utils/dict/hebrew_dict.txt + max_text_length: 25 + infer_mode: false + use_space_char: true +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: L2 + factor: 1.0e-05 +Architecture: + model_type: rec + algorithm: CRNN + Transform: null + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: small + small_stride: + - 1 + - 2 + - 2 + - 2 + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 48 + Head: + name: CTCHead + fc_decay: 1.0e-05 +Loss: + name: CTCLoss +PostProcess: + name: CTCLabelDecode +Metric: + name: RecMetric + main_indicator: acc +Train: + dataset: + name: SimpleDataSet + data_dir: train_data/ + label_file_list: + - train_data/hebrew_train.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecAug: null + - CTCLabelEncode: null + - RecResizeImg: + image_shape: + - 3 + - 32 + - 320 + - KeepKeys: + keep_keys: + - image + - label + - length + loader: + shuffle: true + batch_size_per_card: 256 + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: train_data/ + label_file_list: + - train_data/hebrew_val.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - CTCLabelEncode: null + - RecResizeImg: + image_shape: + - 3 + - 32 + - 320 + - KeepKeys: + keep_keys: + - image + - label + - length + loader: + shuffle: false + drop_last: false + batch_size_per_card: 256 + num_workers: 8 diff --git a/configs/rec/multi_language/rec_samaritan_lite_train.yml b/configs/rec/multi_language/rec_samaritan_lite_train.yml new file mode 100644 index 0000000000..82f9f2c775 --- /dev/null +++ b/configs/rec/multi_language/rec_samaritan_lite_train.yml @@ -0,0 +1,110 @@ +Global: + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_samaritan_lite + save_epoch_step: 3 + eval_batch_step: + - 0 + - 2000 + cal_metric_during_train: true + pretrained_model: null + checkpoints: null + save_inference_dir: null + use_visualdl: false + infer_img: null + character_dict_path: ppocr/utils/dict/samaritan_dict.txt + max_text_length: 25 + infer_mode: false + use_space_char: true +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: L2 + factor: 1.0e-05 +Architecture: + model_type: rec + algorithm: CRNN + Transform: null + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: small + small_stride: + - 1 + - 2 + - 2 + - 2 + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 48 + Head: + name: CTCHead + fc_decay: 1.0e-05 +Loss: + name: CTCLoss +PostProcess: + name: CTCLabelDecode +Metric: + name: RecMetric + main_indicator: acc +Train: + dataset: + name: SimpleDataSet + data_dir: train_data/ + label_file_list: + - train_data/samaritan_train.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecAug: null + - CTCLabelEncode: null + - RecResizeImg: + image_shape: + - 3 + - 32 + - 320 + - KeepKeys: + keep_keys: + - image + - label + - length + loader: + shuffle: true + batch_size_per_card: 256 + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: train_data/ + label_file_list: + - train_data/samaritan_val.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - CTCLabelEncode: null + - RecResizeImg: + image_shape: + - 3 + - 32 + - 320 + - KeepKeys: + keep_keys: + - image + - label + - length + loader: + shuffle: false + drop_last: false + batch_size_per_card: 256 + num_workers: 8 diff --git a/ppocr/utils/dict/hebrew_dict.txt b/ppocr/utils/dict/hebrew_dict.txt new file mode 100644 index 0000000000..ed301379f7 --- /dev/null +++ b/ppocr/utils/dict/hebrew_dict.txt @@ -0,0 +1,214 @@ +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +֑ +֒ +֓ +֔ +֕ +֖ +֗ +֘ +֙ +֚ +֛ +֜ +֝ +֞ +֟ +֠ +֡ +֢ +֣ +֤ +֥ +֦ +֧ +֨ +֩ +֪ +֫ +֬ +֭ +֮ +֯ +ְ +ֱ +ֲ +ֳ +ִ +ֵ +ֶ +ַ +ָ +ֹ +ֺ +ֻ +ּ +ֽ +־ +ֿ +׀ +ׁ +ׂ +׃ +ׄ +ׅ +׆ +ׇ +א +ב +ג +ד +ה +ו +ז +ח +ט +י +ך +כ +ל +ם +מ +ן +נ +ס +ע +ף +פ +ץ +צ +ק +ר +ש +ת +ׯ +װ +ױ +ײ +׳ +״ +יִ +ﬞ +ײַ +ﬠ +ﬡ +ﬢ +ﬣ +ﬤ +ﬥ +ﬦ +ﬧ +ﬨ +﬩ +שׁ +שׂ +שּׁ +שּׂ +אַ +אָ +אּ +בּ +גּ +דּ +הּ +וּ +זּ +טּ +יּ +ךּ +כּ +לּ +מּ +נּ +סּ +ףּ +פּ +צּ +קּ +רּ +שּ +תּ +וֹ +בֿ +כֿ +פֿ +ﭏ diff --git a/ppocr/utils/dict/samaritan_dict.txt b/ppocr/utils/dict/samaritan_dict.txt new file mode 100644 index 0000000000..1fe9187700 --- /dev/null +++ b/ppocr/utils/dict/samaritan_dict.txt @@ -0,0 +1,222 @@ +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +ء +آ +أ +ؤ +إ +ئ +ا +ب +ة +ت +ث +ج +ح +خ +د +ذ +ر +ز +س +ش +ص +ض +ط +ظ +ع +غ +ف +ق +ك +ل +م +ن +ه +و +ى +ي +ً +ٌ +ٍ +َ +ُ +ِ +ّ +ْ +ٓ +ٔ +ٰ +ٱ +ٹ +پ +چ +ڈ +ڑ +ژ +ک +ڭ +گ +ں +ھ +ۀ +ہ +ۂ +ۃ +ۆ +ۇ +ۈ +ۋ +ی +ې +ے +ۓ +ە +١ +٢ +٣ +٤ +٥ +٦ +٧ +٨ +٩ +ࠀ +ࠁ +ࠂ +ࠃ +ࠄ +ࠅ +ࠆ +ࠇ +ࠈ +ࠉ +ࠊ +ࠋ +ࠌ +ࠍ +ࠎ +ࠏ +ࠐ +ࠑ +ࠒ +ࠓ +ࠔ +ࠕ +ࠖ +ࠗ +࠘ +࠙ +ࠚ +ࠛ +ࠜ +ࠝ +ࠞ +ࠟ +ࠠ +ࠡ +ࠢ +ࠣ +ࠤ +ࠥ +ࠦ +ࠧ +ࠨ +ࠩ +ࠪ +ࠫ +ࠬ +࠭ +࠰ +࠱ +࠲ +࠳ +࠴ +࠵ +࠶ +࠷ +࠸ +࠹ +࠺ +࠻ +࠼ +࠽ +࠾