使用DFA算法对敏感词进行过滤,敏感词过滤器

时间:2019-10-07 14:53来源:编程技术
使用Decorator模式包装request对象实现敏感字符过滤功能。敏感词包括了:禁用词:反对共产党、色情。。。。审核词:中共(我家“中共”有三头猪)替换词:和谐社会河蟹社会 项目目

使用Decorator模式包装request对象,实现敏感字符过滤功能。敏感词分为三类:禁用词(如“反对共产党”、“色情”……);审核词(如“中共”,例句:我家“中共”有三头猪);替换词(如“和谐社会”→“河蟹社会”)。

项目目录结构如下:

将所有敏感词汇的文本文件放在config目录下,并把该目录标记为资源根目录。(例如:在IDEA里面将鼠标放在config目录上右击,选择Mark Directory as,然后再选择Resources Root。)

图片 1

WordsFilter里面对敏感词进行过滤

其中resources资源目录中:

package cn.itcast.filter;import javax.servlet.*;import javax.servlet.http.HttpServletRequest;import javax.servlet.http.HttpServletRequestWrapper;import javax.servlet.http.HttpServletResponse;import java.io.*;import java.util.ArrayList;import java.util.Enumeration;import java.util.List;import java.util.regex.Matcher;import java.util.regex.Pattern;/** * Created by yvettee on 2017/11/1. */public class WordsFilter implements Filter { private List<String> banWords = new ArrayList();//保存禁用词汇 private List<String> auditWords = new ArrayList();//保存审核词汇 private List<String> replaceWords = new ArrayList();//保存替换词汇 @Override public void init(FilterConfig filterConfig) throws ServletException { String path = WordsFilter.class.getClassLoader().getResource("cn/itcast/words").getPath(); File files[] = new File.listFiles(); for (File file : files) { if (!file.getName().endsWith { continue; } try { //文本是一行的,所以用BufferedReader InputStreamReader isr = new InputStreamReader(new FileInputStream, "UTF-8"); BufferedReader br = new BufferedReader; String line = null; while ((line = br.readLine != null) { /* 7大军区|3 双桨飞机|3 */ String s[] = line.split; if (s.length != 2) { continue; } if (s[1].trim().equals { banWords.add(s[0].trim; } if (s[1].trim().equals { auditWords.add(s[0].trim; } if (s[1].trim().equals { replaceWords.add(s[0].trim; } } } catch (Exception e) { throw new RuntimeException; } } } @Override public void doFilter(ServletRequest servletRequest, ServletResponse servletResponse, FilterChain filterChain) throws IOException, ServletException { HttpServletRequest request = (HttpServletRequest) servletRequest; HttpServletResponse response = (HttpServletResponse) servletResponse; //检查提交数据是否包含禁用词 Enumeration e = request.getParameterNames();//得到客户机提交过来的所有数据 while (e.hasMoreElements { String name =  e.nextElement(); String value = request.getParameter; //将每一个敏感词看做是一个正则表达式 for (String regex : banWords) { Pattern pattern = Pattern.compile;//编译表达式 //匹配器匹配 Matcher matcher = pattern.matcher; 
if (matcher.find { request.setAttribute("message", "文章中包含非法词汇,请检查后提交"); request.getRequestDispatcher("/message.jsp").forward(request, response); return; } } } //检查提交数据是否包含审核词,有就高亮显示 //检查替换词 filterChain.doFilter(new MyRequest, response); } class MyRequest extends HttpServletRequestWrapper { private HttpServletRequest request; public MyRequest(HttpServletRequest request) { super; this.request = request; } @Override public String getParameter(String name) { String data = this.request.getParameter; if (data == null) { return null; } for (String regex : auditWords) {//auditWords是审核词 Pattern p = Pattern.compile;//将每一个审核词作为正则表达式 Matcher m = p.matcher;//data是获取客户机传递过来的数据 if  { //我有一把仿真手枪,你要电鸡吗?? String value = m.group(); //找出客户机提交的数据中和正则表达式相匹配的数据 data = data.replaceAll(regex, "<font color='red'>" + value + "</font>"); } } for (String regex : replaceWords) { Pattern p = Pattern.compile; Matcher m = p.matcher; if  { //我有一把仿真手枪,你要电鸡吗?? data = data.replaceAll(regex, "*******"); } } return data; } } @Override public void destroy() { }}

stopwd.txt:停顿词,匹配时直接过滤。

web.xml

wd.txt:敏感词库。

<filter> <filter-name>WordsFilter</filter-name> <filter-class>cn.itcast.filter.WordsFilter</filter-class> </filter> <filter-mapping> <filter-name>WordsFilter</filter-name> <url-pattern>/*</url-pattern> </filter-mapping>

1、WordFilter敏感词过滤类:

form.jsp页面里测试

图片 2图片 3

<form action="${pageContext.request.contextPath}/checkServlet" method="post"> <textarea rows="5" cols="50" name="resume"></textarea><br/> <input type="submit" value="提交"></form>
  1 package com.skyer.sensitivewdfilter;
  2 
  3 import java.io.BufferedReader;
  4 import java.io.IOException;
  5 import java.io.InputStreamReader;
  6 import java.util.ArrayList;
  7 import java.util.HashMap;
  8 import java.util.HashSet;
  9 import java.util.List;
 10 import java.util.Map;
 11 import java.util.Set;
 12 
 13 /**
 14  * 思路: 创建一个FilterSet,枚举了0~65535的所有char是否是某个敏感词开头的状态
 15  * 
 16  * 判断是否是 敏感词开头 | | 是 不是 获取头节点 OK--下一个字 然后逐级遍历,DFA算法
 17  */
 18 public class WordFilter {
 19 
 20     private static final FilterSet set = new FilterSet(); // 存储首字
 21     private static final Map<Integer, WordNode> nodes = new HashMap<Integer, WordNode>(1024, 1); // 存储节点
 22     private static final Set<Integer> stopwdSet = new HashSet<Integer>(); // 停顿词
 23     private static final char SIGN = '*'; // 敏感词过滤替换
 24 
 25     static {
 26         try {
 27             long a = System.nanoTime();
 28             init();
 29             a = System.nanoTime() - a;
 30             System.out.println("加载时间 : " + a + "ns");
 31             System.out.println("加载时间 : " + a / 1000000 + "ms");
 32         } catch (Exception e) {
 33             throw new RuntimeException("初始化过滤器失败");
 34         }
 35     }
 36 
 37     private static void init() {
 38         // 获取敏感词
 39         addSensitiveWord(readWordFromFile("wd.txt"));
 40         addStopWord(readWordFromFile("stopwd.txt"));
 41     }
 42 
 43     /**
 44      * 增加敏感词
 45      */
 46     private static List<String> readWordFromFile(String path) {
 47         List<String> words;
 48         BufferedReader br = null;
 49         try {
 50             br = new BufferedReader(new InputStreamReader(WordFilter.class.getClassLoader().getResourceAsStream(path)));
 51             words = new ArrayList<String>(1200);
 52             for (String buf = ""; (buf = br.readLine()) != null;) {
 53                 if (buf == null || buf.trim().equals(""))
 54                     continue;
 55                 words.add(buf);
 56             }
 57         } catch (Exception e) {
 58             throw new RuntimeException(e);
 59         } finally {
 60             try {
 61                 if (br != null)
 62                     br.close();
 63             } catch (IOException e) {
 64             }
 65         }
 66         return words;
 67     }
 68 
 69     /**
 70      * 增加停顿词
 71      */
 72     private static void addStopWord(final List<String> words) {
 73         if (words != null && words.size() > 0) {
 74             char[] chs;
 75             for (String curr : words) {
 76                 chs = curr.toCharArray();
 77                 for (char c : chs) {
 78                     stopwdSet.add(charConvert(c));
 79                 }
 80             }
 81         }
 82     }
 83 
 84     /**
 85      * 添加DFA节点
 86      */
 87     private static void addSensitiveWord(final List<String> words) {
 88         if (words != null && words.size() > 0) {
 89             char[] chs;
 90             int fchar;
 91             int lastIndex;
 92             WordNode fnode; // 首字母节点
 93             for (String curr : words) {
 94                 chs = curr.toCharArray();
 95                 fchar = charConvert(chs[0]);
 96                 if (!set.contains(fchar)) {// 没有首字定义
 97                     set.add(fchar);// 首字标志位 可重复add
 98                     fnode = new WordNode(fchar, chs.length == 1);
 99                     nodes.put(fchar, fnode);
100                 } else {
101                     fnode = nodes.get(fchar);
102                     if (!fnode.isLast() && chs.length == 1)
103                         fnode.setLast(true);
104                 }
105                 lastIndex = chs.length - 1;
106                 for (int i = 1; i < chs.length; i++) {
107                     fnode = fnode.addIfNoExist(charConvert(chs[i]), i == lastIndex);
108                 }
109             }
110         }
111     }
112 
113     /**
114      * 过滤判断 将敏感词转化为成屏蔽词
115      */
116     public static final String doFilter(final String src) {
117         char[] chs = src.toCharArray();
118         int length = chs.length;
119         int currc;
120         int k;
121         WordNode node;
122         for (int i = 0; i < length; i++) {
123             currc = charConvert(chs[i]);
124             if (!set.contains(currc)) {
125                 continue;
126             }
127             node = nodes.get(currc);
128             if (node == null)
129                 continue;
130             boolean couldMark = false;
131             int markNum = -1;
132             if (node.isLast()) {
133                 couldMark = true;
134                 markNum = 0;
135             }
136             k = i;
137             for (; ++k < length;) {
138                 int temp = charConvert(chs[k]);
139                 if (stopwdSet.contains(temp))
140                     continue;
141                 node = node.querySub(temp);
142                 if (node == null)
143                     break;
144                 if (node.isLast()) {
145                     couldMark = true;
146                     markNum = k - i;
147                 }
148             }
149             if (couldMark) {
150                 for (k = 0; k <= markNum; k++) {
151                     chs[k + i] = SIGN;
152                 }
153                 i = i + markNum;
154             }
155         }
156 
157         return new String(chs);
158     }
159 
160     /**
161      * 是否包含敏感词
162      */
163     public static final boolean isContains(final String src) {
164         char[] chs = src.toCharArray();
165         int length = chs.length;
166         int currc;
167         int k;
168         WordNode node;
169         for (int i = 0; i < length; i++) {
170             currc = charConvert(chs[i]);
171             if (!set.contains(currc)) {
172                 continue;
173             }
174             node = nodes.get(currc);
175             if (node == null)
176                 continue;
177             boolean couldMark = false;
178             if (node.isLast()) {
179                 couldMark = true;
180             }
181             k = i;
182             for (; ++k < length;) {
183                 int temp = charConvert(chs[k]);
184                 if (stopwdSet.contains(temp))
185                     continue;
186                 node = node.querySub(temp);
187                 if (node == null)
188                     break;
189                 if (node.isLast()) {
190                     couldMark = true;
191                 }
192             }
193             if (couldMark) {
194                 return true;
195             }
196         }
197 
198         return false;
199     }
200 
201     /**
202      * 大写转化为小写 全角转化为半角
203      */
204     private static int charConvert(char src) {
205         int r = BCConvert.qj2bj(src);
206         return (r >= 'A' && r <= 'Z') ? r + 32 : r;
207     }
208 
209 }

源代码:

WordFilter.java

其中:

      isContains :是否包含敏感词

     doFilter:过滤敏感词

2、WordNode敏感词节点:

图片 4图片 5

 1 package com.skyer.sensitivewdfilter;
 2 
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
 5 
 6 public class WordNode {
 7 
 8     private int value; // 节点名称
 9 
10     private List<WordNode> subNodes; // 子节点
11 
12     private boolean isLast; // 默认false
13 
14     public WordNode(int value) {
15         this.value = value;
16     }
17 
18     public WordNode(int value, boolean isLast) {
19         this.value = value;
20         this.isLast = isLast;
21     }
22 
23     /**
24      * @return 就是传入的subNode
25      */
26     private WordNode addSubNode(final WordNode subNode) {
27         if (subNodes == null)
28             subNodes = new LinkedList<WordNode>();
29         subNodes.add(subNode);
30         return subNode;
31     }
32 
33     /**
34      * 有就直接返回该子节点, 没有就创建添加并返回该子节点
35      */
36     public WordNode addIfNoExist(final int value, final boolean isLast) {
37         if (subNodes == null) {
38             return addSubNode(new WordNode(value, isLast));
39         }
40         for (WordNode subNode : subNodes) {
41             if (subNode.value == value) {
42                 if (!subNode.isLast && isLast)
43                     subNode.isLast = true;
44                 return subNode;
45             }
46         }
47         return addSubNode(new WordNode(value, isLast));
48     }
49 
50     public WordNode querySub(final int value) {
51         if (subNodes == null) {
52             return null;
53         }
54         for (WordNode subNode : subNodes) {
55             if (subNode.value == value)
56                 return subNode;
57         }
58         return null;
59     }
60 
61     public boolean isLast() {
62         return isLast;
63     }
64 
65     public void setLast(boolean isLast) {
66         this.isLast = isLast;
67     }
68 
69     @Override
70     public int hashCode() {
71         return value;
72     }
73 
74 }

WordNode.java

3、测试类:

图片 6图片 7

 1 package com.skyer.test;
 2 
 3 import org.junit.Test;
 4 
 5 import com.skyer.sensitivewdfilter.WordFilter;
 6 
 7 public class TestSensitivewd {
 8 
 9     @Test
10     public void TestFilter() {
11         String s = ""; // 这里写你要过滤的句子(我这里不能写,否则会给博客园屏蔽掉)
12         System.out.println("解析问题: " + s);
13         System.out.println("解析字数 : " + s.length());
14         String re;
15         long nano = System.nanoTime();
16         re = WordFilter.doFilter(s);
17         nano = (System.nanoTime() - nano);
18         System.out.println("解析时间 : " + nano + "ns");
19         System.out.println("解析时间 : " + nano / 1000000 + "ms");
20         System.out.println(re);
21         System.out.println();
22 
23         nano = System.nanoTime();
24         System.out.println("是否包含敏感词: " + WordFilter.isContains(s));
25         nano = (System.nanoTime() - nano);
26         System.out.println("解析时间 : " + nano + "ns");
27         System.out.println("解析时间 : " + nano / 1000000 + "ms");
28     }
29 
30 }

TestSensitivewd.java

4、测试结果:

图片 8

原文参考:

DFA知识:

编辑:编程技术 本文来源:使用DFA算法对敏感词进行过滤,敏感词过滤器

关键词: