mirror of https://github.com/PaddlePaddle/PaddleOCR.git (synced 2025-10-31)
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is adapted from:
https://github.com/LBH1024/CAN/models/can.py
https://github.com/LBH1024/CAN/models/counting.py
https://github.com/LBH1024/CAN/models/decoder.py
https://github.com/LBH1024/CAN/models/attention.py
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

import paddle
import paddle.nn as nn
'''
Counting Module
'''


class ChannelAtt(nn.Layer):
    """Channel attention (squeeze-and-excitation style): re-weights channels
    with a gating signal computed from the global average pool."""

    def __init__(self, channel, reduction):
        super(ChannelAtt, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2D(1)

        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction),
            nn.ReLU(), nn.Linear(channel // reduction, channel), nn.Sigmoid())

    def forward(self, x):
        b, c, _, _ = x.shape
        # Global average pool to [b, c], then scale each channel by its gate.
        y = paddle.reshape(self.avg_pool(x), [b, c])
        y = paddle.reshape(self.fc(y), [b, c, 1, 1])
        return x * y


class CountingDecoder(nn.Layer):
    """Counting decoder: predicts a per-class density map and its spatial sum
    (the per-class symbol counts)."""

    def __init__(self, in_channel, out_channel, kernel_size):
        super(CountingDecoder, self).__init__()
        self.in_channel = in_channel
        self.out_channel = out_channel

        self.trans_layer = nn.Sequential(
            nn.Conv2D(
                self.in_channel,
                512,
                kernel_size=kernel_size,
                padding=kernel_size // 2,
                bias_attr=False),
            nn.BatchNorm2D(512))

        self.channel_att = ChannelAtt(512, 16)

        self.pred_layer = nn.Sequential(
            nn.Conv2D(
                512, self.out_channel, kernel_size=1, bias_attr=False),
            nn.Sigmoid())

    def forward(self, x, mask):
        b, _, h, w = x.shape
        x = self.trans_layer(x)
        x = self.channel_att(x)
        x = self.pred_layer(x)

        if mask is not None:
            x = x * mask
        # Sum the density map over the spatial dims to obtain per-class counts.
        x = paddle.reshape(x, [b, self.out_channel, -1])
        x1 = paddle.sum(x, axis=-1)

        return x1, paddle.reshape(x, [b, self.out_channel, h, w])


'''
Attention Decoder
'''


class PositionEmbeddingSine(nn.Layer):
    """2-D sinusoidal position embedding computed from the valid-pixel mask."""

    def __init__(self,
                 num_pos_feats=64,
                 temperature=10000,
                 normalize=False,
                 scale=None):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, x, mask):
        # Cumulative sums of the mask give the (row, column) coordinate of
        # every valid position.
        y_embed = paddle.cumsum(mask, 1, dtype='float32')
        x_embed = paddle.cumsum(mask, 2, dtype='float32')

        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
        dim_t = paddle.arange(self.num_pos_feats, dtype='float32')
        dim_d = paddle.expand(paddle.to_tensor(2), dim_t.shape)
        dim_t = self.temperature**(2 * (dim_t / dim_d).astype('int64') /
                                   self.num_pos_feats)

        pos_x = paddle.unsqueeze(x_embed, [3]) / dim_t
        pos_y = paddle.unsqueeze(y_embed, [3]) / dim_t

        # Interleave sine and cosine over the feature dimension.
        pos_x = paddle.flatten(
            paddle.stack(
                [
                    paddle.sin(pos_x[:, :, :, 0::2]),
                    paddle.cos(pos_x[:, :, :, 1::2])
                ],
                axis=4),
            3)
        pos_y = paddle.flatten(
            paddle.stack(
                [
                    paddle.sin(pos_y[:, :, :, 0::2]),
                    paddle.cos(pos_y[:, :, :, 1::2])
                ],
                axis=4),
            3)

        pos = paddle.transpose(
            paddle.concat(
                [pos_y, pos_x], axis=3), [0, 3, 1, 2])

        return pos


class AttDecoder(nn.Layer):
    """GRU-based attention decoder. Each step fuses the decoder state, the word
    embedding, the attended visual context and the counting context."""

    def __init__(self, ratio, is_train, input_size, hidden_size,
                 encoder_out_channel, dropout, dropout_ratio, word_num,
                 counting_decoder_out_channel, attention):
        super(AttDecoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.out_channel = encoder_out_channel
        self.attention_dim = attention['attention_dim']
        self.dropout_prob = dropout
        self.ratio = ratio
        self.word_num = word_num

        self.counting_num = counting_decoder_out_channel
        self.is_train = is_train

        self.init_weight = nn.Linear(self.out_channel, self.hidden_size)
        self.embedding = nn.Embedding(self.word_num, self.input_size)
        self.word_input_gru = nn.GRUCell(self.input_size, self.hidden_size)
        self.word_attention = Attention(hidden_size, attention['attention_dim'])

        self.encoder_feature_conv = nn.Conv2D(
            self.out_channel,
            self.attention_dim,
            kernel_size=attention['word_conv_kernel'],
            padding=attention['word_conv_kernel'] // 2)

        self.word_state_weight = nn.Linear(self.hidden_size, self.hidden_size)
        self.word_embedding_weight = nn.Linear(self.input_size,
                                               self.hidden_size)
        self.word_context_weight = nn.Linear(self.out_channel, self.hidden_size)
        self.counting_context_weight = nn.Linear(self.counting_num,
                                                 self.hidden_size)
        self.word_convert = nn.Linear(self.hidden_size, self.word_num)

        if dropout:
            self.dropout = nn.Dropout(dropout_ratio)

    def forward(self, cnn_features, labels, counting_preds, images_mask):
        if self.is_train:
            _, num_steps = labels.shape
        else:
            num_steps = 36

        batch_size, _, height, width = cnn_features.shape
        images_mask = images_mask[:, :, ::self.ratio, ::self.ratio]

        word_probs = paddle.zeros((batch_size, num_steps, self.word_num))
        word_alpha_sum = paddle.zeros((batch_size, 1, height, width))

        hidden = self.init_hidden(cnn_features, images_mask)
        counting_context_weighted = self.counting_context_weight(counting_preds)
        cnn_features_trans = self.encoder_feature_conv(cnn_features)

        position_embedding = PositionEmbeddingSine(256, normalize=True)
        pos = position_embedding(cnn_features_trans, images_mask[:, 0, :, :])

        cnn_features_trans = cnn_features_trans + pos

        word = paddle.ones([batch_size, 1], dtype='int64')  # init word as sos
        word = word.squeeze(axis=1)
        for i in range(num_steps):
            word_embedding = self.embedding(word)
            _, hidden = self.word_input_gru(word_embedding, hidden)
            word_context_vec, _, word_alpha_sum = self.word_attention(
                cnn_features, cnn_features_trans, hidden, word_alpha_sum,
                images_mask)

            current_state = self.word_state_weight(hidden)
            word_weighted_embedding = self.word_embedding_weight(word_embedding)
            word_context_weighted = self.word_context_weight(word_context_vec)

            if self.dropout_prob:
                word_out_state = self.dropout(
                    current_state + word_weighted_embedding +
                    word_context_weighted + counting_context_weighted)
            else:
                word_out_state = (current_state + word_weighted_embedding +
                                  word_context_weighted +
                                  counting_context_weighted)

            word_prob = self.word_convert(word_out_state)
            word_probs[:, i] = word_prob

            if self.is_train:
                # Teacher forcing: feed the ground-truth token at each step.
                word = labels[:, i]
            else:
                word = word_prob.argmax(1)
                word = paddle.multiply(
                    word, labels[:, i]
                )  # labels are oneslike tensor in infer/predict mode

        return word_probs

    def init_hidden(self, features, feature_mask):
        # Masked spatial average of the encoder features initializes the GRU state.
        average = paddle.sum(paddle.sum(features * feature_mask, axis=-1),
                             axis=-1) / paddle.sum(
                                 (paddle.sum(feature_mask, axis=-1)), axis=-1)
        average = self.init_weight(average)
        return paddle.tanh(average)


'''
Attention Module
'''


class Attention(nn.Layer):
    """Spatial attention with a coverage term: the running sum of past
    attention maps (`alpha_sum`) is fed back into the score computation."""

    def __init__(self, hidden_size, attention_dim):
        super(Attention, self).__init__()
        self.hidden = hidden_size
        self.attention_dim = attention_dim
        self.hidden_weight = nn.Linear(self.hidden, self.attention_dim)
        self.attention_conv = nn.Conv2D(
            1, 512, kernel_size=11, padding=5, bias_attr=False)
        self.attention_weight = nn.Linear(
            512, self.attention_dim, bias_attr=False)
        self.alpha_convert = nn.Linear(self.attention_dim, 1)

    def forward(self,
                cnn_features,
                cnn_features_trans,
                hidden,
                alpha_sum,
                image_mask=None):
        query = self.hidden_weight(hidden)
        alpha_sum_trans = self.attention_conv(alpha_sum)
        coverage_alpha = self.attention_weight(
            paddle.transpose(alpha_sum_trans, [0, 2, 3, 1]))
        alpha_score = paddle.tanh(
            paddle.unsqueeze(query, [1, 2]) + coverage_alpha + paddle.transpose(
                cnn_features_trans, [0, 2, 3, 1]))
        energy = self.alpha_convert(alpha_score)
        # Numerically stable masked softmax over the spatial positions.
        energy = energy - energy.max()
        energy_exp = paddle.exp(paddle.squeeze(energy, -1))

        if image_mask is not None:
            energy_exp = energy_exp * paddle.squeeze(image_mask, 1)
        alpha = energy_exp / (paddle.unsqueeze(
            paddle.sum(paddle.sum(energy_exp, -1), -1), [1, 2]) + 1e-10)
        alpha_sum = paddle.unsqueeze(alpha, 1) + alpha_sum
        context_vector = paddle.sum(
            paddle.sum((paddle.unsqueeze(alpha, 1) * cnn_features), -1), -1)

        return context_vector, alpha, alpha_sum


class CANHead(nn.Layer):
    """CAN head: two counting decoders (kernel sizes 3 and 5) whose averaged
    counts condition the attention decoder that predicts the token sequence."""

    def __init__(self, in_channel, out_channel, ratio, attdecoder, **kwargs):
        super(CANHead, self).__init__()

        self.in_channel = in_channel
        self.out_channel = out_channel

        self.counting_decoder1 = CountingDecoder(self.in_channel,
                                                 self.out_channel, 3)  # mscm
        self.counting_decoder2 = CountingDecoder(self.in_channel,
                                                 self.out_channel, 5)

        self.decoder = AttDecoder(ratio, **attdecoder)

        self.ratio = ratio

    def forward(self, inputs, targets=None):
        cnn_features, images_mask, labels = inputs

        # Downsample the image mask to the feature-map resolution for counting.
        counting_mask = images_mask[:, :, ::self.ratio, ::self.ratio]
        counting_preds1, _ = self.counting_decoder1(cnn_features, counting_mask)
        counting_preds2, _ = self.counting_decoder2(cnn_features, counting_mask)
        counting_preds = (counting_preds1 + counting_preds2) / 2

        word_probs = self.decoder(cnn_features, labels, counting_preds,
                                  images_mask)
        return word_probs, counting_preds, counting_preds1, counting_preds2

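
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original file). The channel sizes,
# vocabulary size and the `attdecoder` config below are assumed illustrative
# values in the spirit of the public CAN recipe; the real values come from the
# model's training YAML, not from this file.
if __name__ == "__main__":
    attdecoder_cfg = {
        'is_train': True,
        'input_size': 256,
        'hidden_size': 256,
        'encoder_out_channel': 684,
        'dropout': True,
        'dropout_ratio': 0.5,
        'word_num': 111,
        'counting_decoder_out_channel': 111,
        'attention': {'attention_dim': 512,
                      'word_conv_kernel': 1},
    }
    head = CANHead(
        in_channel=684, out_channel=111, ratio=16, attdecoder=attdecoder_cfg)

    # Dummy inputs: backbone features, full-resolution image mask, token labels.
    cnn_features = paddle.randn([2, 684, 8, 16])
    images_mask = paddle.ones([2, 1, 8 * 16, 16 * 16])
    labels = paddle.ones([2, 10], dtype='int64')

    word_probs, counting_preds, _, _ = head((cnn_features, images_mask, labels))
    print(word_probs.shape)      # [2, 10, 111]
    print(counting_preds.shape)  # [2, 111]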