Python 验证码识别

Oct 7, 2019

验证码图片下载源码

#-*- coding:utf-8 -*-
from urllib.request import urlretrieve
import time, random, os

class Discuz():
    """Downloader that collects labelled Discuz captcha images.

    A random label is generated for every image, appended to ``self.url`` and
    the response is stored as ``./pic/<label>.jpg``, so the file name doubles
    as the ground-truth label of the image.
    """

    # Characters a generated label may contain: digits followed by lowercase
    # letters (captchas are treated as case-insensitive).
    CHAR_SET = '0123456789abcdefghijklmnopqrstuvwxyz'

    def __init__(self):
        # URL of the Discuz captcha generator; the label is appended to it.
        # The original author notes that this address no longer works and
        # must be replaced before use.
        self.url = 'http://cuijiahua.com/tutrial/discuz/index.php?label='

    def random_captcha_text(self, captcha_size=4):
        """
        Build a random captcha label.
        Parameters:
            captcha_size: number of characters in the label (4 by default)
        Returns:
            captcha_text: the label, made of digits and lowercase letters
        """
        # One random.choice call per character, in order, so a seeded run
        # produces the same sequence as a plain append loop would.
        return ''.join(random.choice(self.CHAR_SET) for _ in range(captcha_size))

    def download_discuz(self, nums=50000, delay=0.2):
        """
        Download captcha images into ./pic.
        Parameters:
            nums: number of images to request
            delay: pause in seconds after every request; the server owner
                   asks for at least 200 ms between requests
        """
        dirname = './pic'
        # exist_ok avoids the FileExistsError the old "not in os.listdir()"
        # test caused on every run after the first ('./pic' never equals the
        # bare entry name 'pic').
        os.makedirs(dirname, exist_ok=True)
        for i in range(nums):
            label = self.random_captcha_text()
            print('第%d张图片:%s下载' % (i + 1,label))
            # A repeated label overwrites the earlier file of the same name.
            urlretrieve(url=self.url + label, filename=dirname + '/' + label + '.jpg')
            # Throttle so the remote server is not put under pressure.
            time.sleep(delay)
        print('恭喜图片下载完成!')

# Script entry point: download the default number of captcha images.
if __name__ == '__main__':
    dz = Discuz()
    dz.download_discuz()

模型训练与测试源码

#-*- coding:utf-8 -*-
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import os, random, cv2

class Discuz():
    """CNN captcha recogniser built on the TensorFlow 1.x graph API.

    Image files are listed from ``self.data_path``; the part of each file
    name before the first ``.`` is used as that image's label. Labels are
    encoded as ``max_captcha`` one-hot groups of ``char_set_len`` entries.
    Constructing an instance reads the dataset directory and creates the
    input placeholders (``tf.placeholder``), so it needs TensorFlow 1.x.
    """

    # Index -> character table used by text2vec / vec2text:
    # 0-9 -> digits, 10-35 -> 'A'-'Z', 36-61 -> 'a'-'z', 62 -> '_'.
    _CHARSET = ('0123456789'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                'abcdefghijklmnopqrstuvwxyz'
                '_')

    def __init__(self):
        # Directory holding the dataset images.
        self.data_path = './pic/'
        # Directory the TensorBoard summaries are written to.
        self.log_dir = './logs/'
        # Image size used to reshape the flattened input.
        # NOTE(review): cv2 returns arrays as (rows, cols) and captcha images
        # are usually wider than tall, so these two values look swapped
        # (height 30, width 100?). The flattened length is the same either
        # way, but the reshape in crack_captcha_cnn would then scramble the
        # pixel layout. Confirm against the real image size before changing.
        self.width = 30
        self.heigth = 100
        # Maximum number of loop iterations in train_crack_captcha_cnn.
        self.max_steps = 100
        # Read and split the dataset.
        self.test_imgs, self.test_labels, self.train_imgs, self.train_labels = self.get_imgs()
        # Number of training samples.
        self.train_size = len(self.train_imgs)
        # Number of test samples.
        self.test_size = len(self.test_imgs)
        # Read position of the next training batch.
        self.train_ptr = 0
        # Read position of the next test batch.
        self.test_ptr = 0
        # Alphabet size: 0-9, A-Z, a-z and '_' = 63 symbols.
        self.char_set_len = 63
        # Maximum captcha length.
        self.max_captcha = 4
        # Placeholder for the flattened input images.
        self.X = tf.placeholder(tf.float32, [None, self.heigth*self.width])
        # Placeholder for the one-hot encoded labels.
        self.Y = tf.placeholder(tf.float32, [None, self.char_set_len*self.max_captcha])
        # Placeholder for the dropout keep probability.
        self.keep_prob = tf.placeholder(tf.float32)

    def get_imgs(self, rate=0.2):
        """
        List the dataset directory, shuffle it and split it.
        Parameters:
            rate: test-to-train size ratio; the test set receives
                  int(total * rate / (1 + rate)) files
        Returns:
            test_imgs, test_labels, train_imgs, train_labels: file names and
            the labels derived from them (text before the first '.')
        """
        imgs = os.listdir(self.data_path)
        random.shuffle(imgs)
        test_num = int(len(imgs) * rate / (1 + rate))
        test_imgs = imgs[:test_num]
        train_imgs = imgs[test_num:]
        test_labels = [name.split('.')[0] for name in test_imgs]
        train_labels = [name.split('.')[0] for name in train_imgs]
        return test_imgs, test_labels, train_imgs, train_labels

    def _shuffle_train(self):
        """Shuffle the training files and labels together, in place."""
        paired = list(zip(self.train_imgs, self.train_labels))
        random.shuffle(paired)
        self.train_imgs = [name for name, _ in paired]
        self.train_labels = [label for _, label in paired]

    @staticmethod
    def _cyclic_indices(ptr, size, batch_size):
        """Return batch_size indices starting at ptr, wrapping at size, and the next ptr."""
        if size <= 0:
            raise ValueError('cannot draw a batch from an empty dataset')
        indices = [(ptr + k) % size for k in range(batch_size)]
        return indices, (ptr + batch_size) % size

    def _load_batch(self, names, labels):
        """Read the named images and encode their labels into two arrays."""
        batch_x = np.zeros([len(names), self.heigth*self.width])
        batch_y = np.zeros([len(names), self.max_captcha*self.char_set_len])
        for row, (name, label) in enumerate(zip(names, labels)):
            img = cv2.imread(self.data_path + name)
            if img is None:
                # Fail with the file name instead of an obscure numpy error.
                raise IOError('cannot read image: ' + self.data_path + name)
            # Average over the last axis (colour channels), flatten to one
            # dimension and scale by 255.
            batch_x[row, :] = np.mean(img, -1).flatten() / 255
            batch_y[row, :] = self.text2vec(label)
        return batch_x, batch_y

    def get_next_batch(self, train_flag=True, batch_size=100):
        """
        Fetch the next batch and advance the matching read pointer.
        Parameters:
            train_flag: truthy -> read from the training set, otherwise from
                        the test set
            batch_size: number of samples in the batch
        Returns:
            batch_x: array [batch_size, heigth*width] of scaled pixels
            batch_y: array [batch_size, max_captcha*char_set_len] of labels
        Raises:
            ValueError: the selected set is empty
            IOError: an image file cannot be read
        """
        if train_flag:
            imgs, labels = self.train_imgs, self.train_labels
            picked, self.train_ptr = self._cyclic_indices(
                self.train_ptr, self.train_size, batch_size)
        else:
            imgs, labels = self.test_imgs, self.test_labels
            picked, self.test_ptr = self._cyclic_indices(
                self.test_ptr, self.test_size, batch_size)
        # Index-based wrap-around keeps the batch full even when batch_size
        # exceeds the set size (samples are then repeated).
        return self._load_batch([imgs[k] for k in picked],
                                [labels[k] for k in picked])

    @staticmethod
    def _char2pos(c):
        """Map one character to its alphabet index; raise ValueError if it has none."""
        if c == '_':
            return 62
        if '0' <= c <= '9':
            return ord(c) - 48
        if 'A' <= c <= 'Z':
            return ord(c) - 55
        if 'a' <= c <= 'z':
            return ord(c) - 61
        raise ValueError('No Map')

    def text2vec(self, text):
        """
        Encode a label as concatenated one-hot groups.
        Parameters:
            text: label of at most max_captcha characters from 0-9, A-Z,
                  a-z and '_'
        Returns:
            vector: array of length max_captcha*char_set_len; positions past
                    the end of a shorter text stay all-zero
        Raises:
            ValueError: text is too long or contains an unmapped character
        """
        if len(text) > self.max_captcha:
            raise ValueError('验证码最长%d个字符' % self.max_captcha)

        vector = np.zeros(self.max_captcha * self.char_set_len)
        for i, c in enumerate(text):
            vector[i * self.char_set_len + self._char2pos(c)] = 1
        return vector

    def vec2text(self, vec):
        """
        Decode a label vector back to text.
        Parameters:
            vec: numpy array as produced by text2vec
        Returns:
            the characters of all non-zero entries, in index order
        Raises:
            ValueError: an index falls outside the known alphabet
        """
        text = []
        for c in vec.nonzero()[0]:
            char_idx = c % self.char_set_len
            if char_idx >= len(self._CHARSET):
                raise ValueError('error')
            text.append(self._CHARSET[char_idx])
        return "".join(text)

    def _conv_pool(self, inputs, in_channels, out_channels, w_alpha, b_alpha):
        """3x3 SAME convolution + bias + ReLU, followed by 2x2 max pooling."""
        # Filter shape: [filter_height, filter_width, in_channels, out_channels].
        weights = tf.Variable(w_alpha*tf.random_normal([3, 3, in_channels, out_channels]))
        bias = tf.Variable(b_alpha*tf.random_normal([out_channels]))
        conv = tf.nn.conv2d(inputs, weights, strides=[1, 1, 1, 1], padding='SAME')
        conv = tf.nn.relu(tf.nn.bias_add(conv, bias))
        return tf.nn.max_pool(conv, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    def crack_captcha_cnn(self, w_alpha=0.01, b_alpha=0.1):
        """
        Build the network: three conv/pool stages, one dense layer with
        dropout, and a linear output layer.
        Parameters:
            w_alpha: scale applied to the random-normal weight initialisers
            b_alpha: scale applied to the random-normal bias initialisers
        Returns:
            out: logits tensor of shape [batch, max_captcha*char_set_len]
        """
        x = tf.reshape(self.X, shape=[-1, self.heigth, self.width, 1])
        conv1 = self._conv_pool(x, 1, 32, w_alpha, b_alpha)
        conv2 = self._conv_pool(conv1, 32, 64, w_alpha, b_alpha)
        conv3 = self._conv_pool(conv2, 64, 64, w_alpha, b_alpha)

        # Derive the flattened size from the conv output instead of the
        # hard-coded 4*13*64, so other image sizes work too (for the
        # configured 100x30 input this is still 13*4*64).
        flat_dim = int(np.prod(conv3.get_shape().as_list()[1:]))
        w_d = tf.Variable(w_alpha*tf.random_normal([flat_dim, 1024]))
        b_d = tf.Variable(b_alpha*tf.random_normal([1024]))

        dense = tf.reshape(conv3, [-1, flat_dim])
        dense = tf.nn.relu(tf.add(tf.matmul(dense, w_d), b_d))
        dense = tf.nn.dropout(dense, self.keep_prob)

        w_out = tf.Variable(w_alpha*tf.random_normal([1024, self.max_captcha*self.char_set_len]))
        b_out = tf.Variable(b_alpha*tf.random_normal([self.max_captcha*self.char_set_len]))
        out = tf.add(tf.matmul(dense, w_out), b_out)
        return out

    def train_crack_captcha_cnn(self):
        """
        Train the network and log loss/accuracy summaries to self.log_dir.

        Runs at most self.max_steps iterations. Every 10th iteration
        evaluates one test batch instead of training and stops early when
        its accuracy exceeds 0.85. No model checkpoint is written.
        """
        output = self.crack_captcha_cnn()
        # Loss: mean sigmoid cross-entropy over all output units.
        diff = tf.nn.sigmoid_cross_entropy_with_logits(logits=output, labels=self.Y)
        loss = tf.reduce_mean(diff)
        tf.summary.scalar('loss', loss)

        # Minimise the loss with Adam.
        optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)

        # Accuracy: share of character positions whose argmax matches.
        y = tf.reshape(output, [-1, self.max_captcha, self.char_set_len])
        y_ = tf.reshape(self.Y, [-1, self.max_captcha, self.char_set_len])
        correct_pred = tf.equal(tf.argmax(y, 2), tf.argmax(y_, 2))
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        tf.summary.scalar('accuracy', accuracy)

        merged = tf.summary.merge_all()

        with tf.Session() as sess:
            train_writer = tf.summary.FileWriter(self.log_dir + '/train', sess.graph)
            test_writer = tf.summary.FileWriter(self.log_dir + '/test')
            try:
                sess.run(tf.global_variables_initializer())

                for i in range(self.max_steps):
                    # Every 20 iterations reshuffle the training data only.
                    # Re-running get_imgs() here would draw a new train/test
                    # split and leak test images into the training set.
                    if i % 20 == 0:
                        self._shuffle_train()
                    # Every 10 iterations measure accuracy on a test batch.
                    if i % 10 == 0:
                        batch_x_test, batch_y_test = self.get_next_batch(False, 100)
                        summary, acc = sess.run(
                            [merged, accuracy],
                            feed_dict={self.X: batch_x_test, self.Y: batch_y_test, self.keep_prob: 1})
                        print('迭代第%d次 accuracy:%f' % (i+1, acc))
                        test_writer.add_summary(summary, i)

                        # Stop once accuracy exceeds 85%.
                        if acc > 0.85:
                            break
                    # Otherwise run one training step.
                    else:
                        batch_x, batch_y = self.get_next_batch(True, 100)
                        # keep_prob is fed as 1, so dropout is inactive here.
                        feed = {self.X: batch_x, self.Y: batch_y, self.keep_prob: 1}
                        loss_value, _ = sess.run([loss, optimizer], feed_dict=feed)
                        print('迭代第%d次 loss:%f' % (i+1, loss_value))
                        # Log the training curve from the training batch (it
                        # was previously computed from the last test batch).
                        curve = sess.run(merged, feed_dict=feed)
                        train_writer.add_summary(curve, i)
            finally:
                # Close each writer exactly once, on every exit path.
                train_writer.close()
                test_writer.close()


# Script entry point: load the dataset, build the network and train it.
if __name__ == '__main__':
    dz = Discuz()
    dz.train_crack_captcha_cnn()