Java|java识别图片验证码

之前在进行selenium自动化测试时需要对项目内的验证码进行识别，通常有三种方法进行验证码处理：
去除验证码
万能验证码
自动识别
但由于部分数据未提供api，我们只能通过自动识别的方式进行“破解”。

鄙人使用两种开源技术进行尝试：tess4j和tesseract-ocr(OCR)

tess4j方式识别验证码

1.下载tessdata和各种训练语言包

下载tessdata:
github下载tesseract中的tessdata文件夹即可，
下载地址：https://github.com/tesseract-ocr/tesseract/tree/main
存放位置：
在这里插入图片描述
下载训练语言包：
tessdata支持多种语言类型的验证码，比如英文数字类型的验证码对应的训练数据文件为eng.traineddata
下载链接：https://github.com/tesseract-ocr/tessdata
存放位置：
放在上面下载的tessdata文件夹中

最快捷的方式，使用鄙人整理好的文件，下载地址：

2.加入maven依赖


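        <dependency>
            <groupId>net.java.dev.jna</groupId>
            <artifactId>jna</artifactId>
            <version>4.2.1</version>
        </dependency>
        <dependency>
            <groupId>net.sourceforge.tess4j</groupId>
            <artifactId>tess4j</artifactId>
            <version>4.5.1</version>
        </dependency>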
1
2
3
4
5
6
7
8
9
10

3.编写代码

/**
 * Demo that runs Tess4J OCR over a captcha image and prints the recognised text.
 */
public class TestImgVer {

    /**
     * Recognises the sample captcha using the {@code tessdata} folder and prints the result.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final String tessdataDir = "tessdata";
        final String captchaImage = "src/test/resources/3esg.png";
        final String recognised = baseVerCode(tessdataDir, captchaImage);
        System.out.println(recognised);
    }

    /**
     * Runs OCR on an alphanumeric captcha image that contains no interference elements.
     *
     * @param dataPath    path of the tessdata folder handed to the engine
     * @param picturePath path of the image file to recognise
     * @return the text returned by {@code doOCR}, or {@code null} when a
     *         {@code TesseractException} is thrown (its stack trace is printed)
     */
    public static String baseVerCode(String dataPath, String picturePath) {
        final Tesseract engine = new Tesseract();
        engine.setDatapath(dataPath);
        // Further settings (language, OCR engine mode, ...) would be applied here.
        try {
            return engine.doOCR(new File(picturePath));
        } catch (TesseractException ocrFailure) {
            ocrFailure.printStackTrace();
            return null;
        }
    }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22

4.结果验证

识别的图片：
在这里插入图片描述
运行结果

如果使用带有干扰线等干扰项的验证码时进行识别效果如下：

运行结果：

这时我们可以使用第二种方式tesseract-ocr进行识别：该方式在tess4j识别的基础上，增加了对验证码的去噪点、二值化等预处理操作

tesseract-ocr方式识别验证码

1.安装tesseract-ocr

文章链接：http://t.csdn.cn/8lfjY

2.加入maven依赖


        
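        <dependency>
            <groupId>net.java.dev.jna</groupId>
            <artifactId>jna</artifactId>
            <version>4.2.1</version>
        </dependency>
        <dependency>
            <groupId>net.sourceforge.tess4j</groupId>
            <artifactId>tess4j</artifactId>
            <version>4.5.1</version>
        </dependency>
        <dependency>
            <groupId>org.openpnp</groupId>
            <artifactId>opencv</artifactId>
            <version>3.2.0-1</version>
        </dependency>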
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16

3.带干扰项验证码处理（去噪、二值化等操作）

    /**
     * Cleans the sample captcha image and stores the result in a {@code tmp} folder that sits
     * next to the source image.
     *
     * @param args unused
     * @throws IOException if {@code cleanLinesInImage} fails to read or write an image
     */
    public static void main(String[] args) throws IOException
    {
        File file = new File("src/test/resources/kaptcha.jpg");
        // Resolve the child folder through File so the platform separator is used; a literal
        // "\\tmp" suffix only forms a sub-directory on Windows.
        final String destDir = new File(file.getParent(), "tmp").getPath();
        // A single pass is enough: every call re-reads the same source file and overwrites
        // the same output file, so repeating it only repeats the console output.
        cleanLinesInImage(file, destDir);
    }
   /**
    * Binarises a captcha image, removes thin interference lines and saves the result.
    *
    * <p>Steps: brighten every pixel and convert it to a gray level, threshold the gray levels
    * with {@code ostu}, turn black pixels white when a pair of opposite neighbours is white,
    * print the resulting bitmap to standard output, and write it in "jpg" format to
    * {@code destDir} under the source file's name.
    *
    * @param sfile
    *            image to be denoised
    * @param destDir
    *            directory in which the denoised image is saved; created when missing
    * @throws IOException
    *             if the directory cannot be created, the file cannot be read or decoded as an
    *             image, or the result cannot be written
    */
   public static void cleanLinesInImage(File sfile, String destDir)  throws IOException{
       File destF = new File(destDir);
       if (!destF.exists() && !destF.mkdirs())
       {
           throw new IOException("Cannot create output directory: " + destF);
       }

       BufferedImage bufferedImage = ImageIO.read(sfile);
       if (bufferedImage == null)
       {
           // ImageIO.read signals "no registered reader can decode this" with null, not an exception.
           throw new IOException("Unsupported or corrupt image file: " + sfile);
       }
       int h = bufferedImage.getHeight();
       int w = bufferedImage.getWidth();

       // Grayscale conversion
       int[][] gray = new int[w][h];
       for (int x = 0; x < w; x++)
       {
           for (int y = 0; y < h; y++)
           {
               int argb = bufferedImage.getRGB(x, y);
               // Brighten each channel (x1.1, +30) and clamp to 255; per the original author,
               // adjusting the brightness gives a very high recognition rate.
               int r = Math.min(255, (int) (((argb >> 16) & 0xFF) * 1.1 + 30));
               int g = Math.min(255, (int) (((argb >> 8) & 0xFF) * 1.1 + 30));
               int b = Math.min(255, (int) (((argb >> 0) & 0xFF) * 1.1 + 30));
               // Weighted combination of the channels in the 2.2-power domain, mapped back
               // with the inverse exponent.
               gray[x][y] = (int) Math
                       .pow((Math.pow(r, 2.2) * 0.2973 + Math.pow(g, 2.2)
                               * 0.6274 + Math.pow(b, 2.2) * 0.0753), 1 / 2.2);
           }
       }

       // Binarisation: levels above the computed threshold become white, the rest black.
       int threshold = ostu(gray, w, h);
       BufferedImage binaryBufferedImage = new BufferedImage(w, h, BufferedImage.TYPE_BYTE_BINARY);
       for (int x = 0; x < w; x++)
       {
           for (int y = 0; y < h; y++)
           {
               int rgb = gray[x][y] > threshold ? 0xFFFFFF : 0x000000;
               binaryBufferedImage.setRGB(x, y, rgb);
           }
       }

       // Remove interference lines. The image is edited in place, so pixels already whitened
       // earlier in the scan influence the decision for later pixels; border pixels are skipped.
       for (int y = 1; y < h - 1; y++)
       {
           for (int x = 1; x < w - 1; x++)
           {
               if (!isBlack(binaryBufferedImage.getRGB(x, y)))
               {
                   continue;
               }
               // Left and right neighbours both white
               boolean horizontalGap = isWhite(binaryBufferedImage.getRGB(x - 1, y))
                       && isWhite(binaryBufferedImage.getRGB(x + 1, y));
               // Upper and lower neighbours both white
               boolean verticalGap = isWhite(binaryBufferedImage.getRGB(x, y + 1))
                       && isWhite(binaryBufferedImage.getRGB(x, y - 1));
               // Either pair of diagonal neighbours both white
               boolean diagonalGap = isWhite(binaryBufferedImage.getRGB(x - 1, y + 1))
                       && isWhite(binaryBufferedImage.getRGB(x + 1, y - 1));
               boolean antiDiagonalGap = isWhite(binaryBufferedImage.getRGB(x + 1, y + 1))
                       && isWhite(binaryBufferedImage.getRGB(x - 1, y - 1));
               if (horizontalGap || verticalGap || diagonalGap || antiDiagonalGap)
               {
                   binaryBufferedImage.setRGB(x, y, -1); // -1 == opaque white
               }
           }
       }

       // Print the bitmap: '*' for black pixels, ' ' for everything else, one text line per row.
       for (int y = 0; y < h; y++)
       {
           StringBuilder row = new StringBuilder(w);
           for (int x = 0; x < w; x++)
           {
               row.append(isBlack(binaryBufferedImage.getRGB(x, y)) ? '*' : ' ');
           }
           System.out.println(row);
       }

       ImageIO.write(binaryBufferedImage, "jpg", new File(destDir, sfile
               .getName()));
   }

   /**
    * Tells whether a pixel is treated as black.
    *
    * @param colorInt packed RGB value of the pixel
    * @return {@code true} when red + green + blue is at most 300
    */
   public static boolean isBlack(int colorInt)
   {
       final Color pixel = new Color(colorInt);
       final int channelSum = pixel.getRed() + pixel.getGreen() + pixel.getBlue();
       return channelSum <= 300;
   }

   /**
    * Tells whether a pixel is treated as white.
    *
    * @param colorInt packed RGB value of the pixel
    * @return {@code true} when red + green + blue exceeds 300
    */
   public static boolean isWhite(int colorInt)
   {
       final Color pixel = new Color(colorInt);
       final int channelSum = pixel.getRed() + pixel.getGreen() + pixel.getBlue();
       return channelSum > 300;
   }

   /**
    * Flags pixels whose channel sum lies at either extreme of the range.
    *
    * @param colorInt packed RGB value of the pixel
    * @return 1 when the sum reported by {@code getColorBright} is below 30 or above 730,
    *         otherwise 0
    */
   public static int isBlackOrWhite(int colorInt)
   {
       final int brightness = getColorBright(colorInt);
       final boolean extreme = brightness < 30 || brightness > 730;
       return extreme ? 1 : 0;
   }

   /**
    * Sums the red, green and blue components of a packed RGB value.
    *
    * @param colorInt packed RGB value; bits above the low 24 are ignored
    * @return red + green + blue, in the range 0..765
    */
   public static int getColorBright(int colorInt)
   {
       final int red = (colorInt >> 16) & 0xFF;
       final int green = (colorInt >> 8) & 0xFF;
       final int blue = colorInt & 0xFF;
       return red + green + blue;
   }

   /**
    * Picks a binarisation threshold with Otsu's method: the gray level that maximises the
    * between-class variance of the pixels at or below it versus the pixels above it.
    *
    * @param gray gray levels indexed as {@code gray[x][y]}; only the low 8 bits of each entry are used
    * @param w    number of columns to read from {@code gray}
    * @param h    number of rows to read from {@code gray}
    * @return the selected level in 0..255; 0 when no split into two non-empty classes exists
    *         (for example a single-level or empty image)
    */
   public static int ostu(int[][] gray, int w, int h)
   {
       // One bin per 8-bit gray level. Sizing the histogram by pixel count (w * h) overflowed
       // the array for any image with fewer than 256 pixels.
       int[] histData = new int[256];
       for (int x = 0; x < w; x++)
       {
           for (int y = 0; y < h; y++)
           {
               histData[0xFF & gray[x][y]]++;
           }
       }

       // Total number of pixels
       int total = w * h;

       // Integer sums stay exact in long; float accumulation drops low bits on large images.
       long sum = 0;
       for (int t = 0; t < 256; t++)
       {
           sum += (long) t * histData[t];
       }

       long sumB = 0;
       int wB = 0;

       double varMax = 0;
       int threshold = 0;

       for (int t = 0; t < 256; t++)
       {
           wB += histData[t]; // Weight of the background class (levels <= t)
           if (wB == 0)
           {
               continue;
           }

           int wF = total - wB; // Weight of the foreground class (levels > t)
           if (wF == 0)
           {
               break;
           }

           sumB += (long) t * histData[t];

           double mB = (double) sumB / wB; // Mean of the background class
           double mF = (double) (sum - sumB) / wF; // Mean of the foreground class

           // Between-class variance (up to a constant factor)
           double varBetween = (double) wB * (double) wF * (mB - mF) * (mB - mF);

           // Keep the first level that reaches the maximum
           if (varBetween > varMax)
           {
               varMax = varBetween;
               threshold = t;
           }
       }

       return threshold;
   }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212

tesseract-ocr方式识别存在的问题：若验证码干扰元素过多，则处理后的验证码缺失点过多，导致验证码识别结果存在偏差，查阅其他资料发现使用python脚本进行识别处理结果的可信度远远高于以上两种方法
Python使用OCR技术识别验证码后续更新

tess4j方式识别验证码

1.下载tessdata和各种训练语言包

2.加入maven依赖

3.编写代码

4.结果验证

tesseract-ocr方式识别验证码

1.安装tesseract-ocr

2.加入maven依赖

3.带干扰项验证码处理（去噪、二值化等操作）

评论记录：