mirror of
				https://github.com/PaddlePaddle/PaddleOCR.git
				synced 2025-10-31 09:49:30 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			67 lines
		
	
	
		
			2.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			67 lines
		
	
	
		
			2.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 | |
| #
 | |
| # Licensed under the Apache License, Version 2.0 (the "License");
 | |
| # you may not use this file except in compliance with the License.
 | |
| # You may obtain a copy of the License at
 | |
| #
 | |
| #     http://www.apache.org/licenses/LICENSE-2.0
 | |
| #
 | |
| # Unless required by applicable law or agreed to in writing, software
 | |
| # distributed under the License is distributed on an "AS IS" BASIS,
 | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| # See the License for the specific language governing permissions and
 | |
| # limitations under the License.
 | |
| import random
 | |
| 
 | |
| from utils.logging import get_logger
 | |
| 
 | |
| 
 | |
class FileCorpus(object):
    """Corpus generator that serves text samples read from a file.

    The file is read once at construction time, one sample per line. The
    samples are shuffled and handed out sequentially by ``generate``; when
    every sample has been served the list is reshuffled and iteration
    restarts from the beginning.
    """

    def __init__(self, config):
        """Load and shuffle the corpus described by ``config``.

        Args:
            config: Mapping with a ``"CorpusGenerator"`` section that
                provides ``"corpus_file"`` (path of the text file) and
                ``"language"`` (value returned alongside every sample).

        Raises:
            AssertionError: If the corpus file contains no lines.
        """
        self.logger = get_logger()
        self.logger.info("using FileCorpus")

        self.char_list = " 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

        corpus_file = config["CorpusGenerator"]["corpus_file"]
        self.language = config["CorpusGenerator"]["language"]
        self.corpus_list = self._read_corpus(corpus_file)
        assert len(self.corpus_list) > 0, \
            "corpus file {} contains no lines".format(corpus_file)
        random.shuffle(self.corpus_list)
        self.index = 0

    @staticmethod
    def _read_corpus(corpus_file):
        """Return the lines of ``corpus_file`` without their line endings.

        Reading line by line (instead of ``split("\\n")[:-1]``) keeps the
        last line even when the file does not end with a newline. The
        encoding is pinned to UTF-8 so decoding does not depend on the
        platform's default locale.
        """
        with open(corpus_file, 'r', encoding="utf-8") as f:
            return [line.rstrip("\n") for line in f]

    def generate(self, corpus_length=0):
        """Return the next sample as a ``(language, text)`` tuple.

        Args:
            corpus_length: Maximum number of characters to return. ``0``
                (the default) returns the full line.

        Returns:
            tuple: ``(self.language, text)``. A warning is logged when the
            selected line is shorter than ``corpus_length``.
        """
        # Every sample has been served: start a new, reshuffled pass.
        if self.index >= len(self.corpus_list):
            self.index = 0
            random.shuffle(self.corpus_list)
        corpus = self.corpus_list[self.index]
        if corpus_length != 0:
            corpus = corpus[0:corpus_length]
        # After truncation the text can only be shorter than requested when
        # the source line itself was too short.
        if corpus_length > len(corpus):
            self.logger.warning("generated corpus is shorter than expected.")
        self.index += 1
        return self.language, corpus
 | |
| 
 | |
| 
 | |
class EnNumCorpus(object):
    """Corpus generator producing random strings of digits and letters.

    Each character is drawn independently: an English letter with
    probability 0.2, otherwise a decimal digit.
    """

    def __init__(self, config):
        """Set up the character pools and read the image size from config.

        Args:
            config: Mapping with a ``"Global"`` section that provides
                ``"image_height"`` and ``"image_width"``.
        """
        self.logger = get_logger()
        # The original message said "NumberCorpus", which does not match
        # this class and is misleading when reading logs.
        self.logger.info("using EnNumCorpus")
        self.num_list = "0123456789"
        self.en_char_list = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
        self.height = config["Global"]["image_height"]
        self.max_width = config["Global"]["image_width"]

    def generate(self, corpus_length=0):
        """Return a random ``("en", text)`` sample.

        Args:
            corpus_length: Number of characters to generate. ``0`` (the
                default) picks a random length between 5 and 15 inclusive.

        Returns:
            tuple: ``("en", text)`` where ``text`` consists of characters
            from ``self.num_list`` and ``self.en_char_list``.
        """
        if corpus_length == 0:
            corpus_length = random.randint(5, 15)
        # The condition is evaluated before the chosen branch, so the draw
        # order (random() then choice()) matches the original loop and
        # seeded output is unchanged.
        corpus = "".join(
            random.choice(self.en_char_list)
            if random.random() < 0.2
            else random.choice(self.num_list)
            for _ in range(corpus_length)
        )
        return "en", corpus
 | 
