1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
class VGGnet_train(Network):
    """Training-time CTPN graph: VGG16 backbone, RPN conv, BiLSTM and two heads.

    Inherits from ``Network``; for the base class see
    https://github.com/xiaofengShi/CHINESE-OCR/blob/master/ctpn/lib/networks/network.py
    """

    def __init__(self, trainable=True):
        """Create the input placeholders, register them as layers, build the graph.

        Args:
            trainable: Stored on ``self.trainable`` before ``setup()`` is called.
        """
        self.inputs = []

        # Input placeholders. `keep_prob` is the only one created without an
        # explicit name and it is not registered in `self.layers`.
        self.data = tf.placeholder(
            tf.float32, shape=[None, None, None, 3], name='data')
        self.im_info = tf.placeholder(
            tf.float32, shape=[None, 3], name='im_info')
        self.gt_boxes = tf.placeholder(
            tf.float32, shape=[None, 5], name='gt_boxes')
        self.gt_ishard = tf.placeholder(
            tf.int32, shape=[None], name='gt_ishard')
        self.dontcare_areas = tf.placeholder(
            tf.float32, shape=[None, 4], name='dontcare_areas')
        self.keep_prob = tf.placeholder(tf.float32)

        # Name -> tensor lookup used by `feed(...)` calls in `setup()`.
        self.layers = {
            'data': self.data,
            'im_info': self.im_info,
            'gt_boxes': self.gt_boxes,
            'gt_ishard': self.gt_ishard,
            'dontcare_areas': self.dontcare_areas,
        }

        self.trainable = trainable
        self.setup()

    def setup(self):
        """Chain the backbone, the RPN conv, the BiLSTM and the prediction heads."""
        # Original author's note: for text proposals there are 2 classes, one
        # for text regions and one for background.
        # NOTE(review): `n_classes` is not referenced in the visible part of
        # this method.
        n_classes = cfg.NCLASSES
        # Original author's note: initial anchor size; the paper uses 16.
        anchor_scales = cfg.ANCHOR_SCALES
        # NOTE(review): `_feat_stride` is not referenced in the visible part
        # of this method either.
        _feat_stride = [16]

        # Base net is VGG16, built with the layer helpers of the base class.
        # Each row: (output channels, conv layer names, trailing pool name).
        vgg_stages = (
            (64, ('conv1_1', 'conv1_2'), 'pool1'),
            (128, ('conv2_1', 'conv2_2'), 'pool2'),
            (256, ('conv3_1', 'conv3_2', 'conv3_3'), 'pool3'),
            (512, ('conv4_1', 'conv4_2', 'conv4_3'), 'pool4'),
            (512, ('conv5_1', 'conv5_2', 'conv5_3'), None),  # no pool5
        )
        net = self.feed('data')
        for channels, conv_names, pool_name in vgg_stages:
            for conv_name in conv_names:
                net = net.conv(3, 3, channels, 1, 1, name=conv_name)
            if pool_name is not None:
                net = net.max_pool(2, 2, 2, 2, padding='VALID', name=pool_name)

        # RPN: convolve the previous feature map into a 512-channel feature
        # map. Per the original notes, the last conv feature map has size
        # batch * h * w * 512.
        self.feed('conv5_3').conv(3, 3, 512, 1, 1, name='rpn_conv/3x3')

        # The original single-layer bidirectional LSTM. Per the original
        # notes, its output has size (N, H, W, 512).
        self.feed('rpn_conv/3x3').Bilstm(512, 128, 512, name='lstm_o')

        # Original author's notes on the heads:
        #   Similar to Faster R-CNN, but in CTPN's RPN a bidirectional LSTM
        #   plus a fully connected layer produce the objectness probabilities
        #   and the regression boxes, whereas Faster R-CNN generates them by
        #   convolution from the last layer of the base net.
        #   The LSTM output is used to compute position offsets and class
        #   probabilities (object vs. not; the object category is not
        #   predicted).
        #   Input size is (N, H, W, 512); output size is (N, H, W, int(d_o)).
        #   This layer can be seen as the final feature map of a detector.
        #   rpn_bbox_pred -- over the h*w grid, 4 position offsets per anchor.
        #   rpn_cls_score -- over the h*w grid, 2 confidence scores per anchor,
        #                    deciding whether it is an object.
        anchors_per_position = len(anchor_scales) * 10
        self.feed('lstm_o').lstm_fc(
            512, anchors_per_position * 4, name='rpn_bbox_pred')
        self.feed('lstm_o').lstm_fc(
            512, anchors_per_position * 2, name='rpn_cls_score')
|