介绍
本文使用字典树(Trie)加前缀表实现敏感词过滤功能:字典树存储敏感词库;前缀表为每个节点记录匹配失败时的回退节点,避免从头重新匹配,从而加速敏感词的匹配。
基本字典树节点
/**
 * One node of the sensitive-word trie.
 *
 * <p>Besides its children, a node carries a link to its parent, the character
 * it stands for, and a fallback ("prefix") link. Accessors are generated by
 * Lombok; the fluent {@code end()} / {@code end(boolean)} pair is hand-written.
 */
@Getter
@Setter
public class WordDataTreeNode implements Serializable {
    /**
     * Marks the end of a keyword: true when a word terminates at this node.
     */
    private boolean end;

    /**
     * Parent node.
     */
    WordDataTreeNode father;

    /**
     * Child nodes of this multi-way tree, keyed by the next character.
     * Created lazily by {@link #addSubNode(char, WordDataTreeNode)}, so it
     * stays {@code null} until the first child is added.
     */
    private Map<Character, WordDataTreeNode> subNodeMap;

    /**
     * Fallback node to back-track to.
     */
    WordDataTreeNode prefix;

    /**
     * Character represented by this node.
     */
    Character charNow;

    /**
     * Fluent read accessor for the end-of-word flag.
     *
     * @return the current value of the end-of-word flag
     */
    public boolean end() {
        return end;
    }

    /**
     * Fluent write accessor for the end-of-word flag.
     *
     * @param end new value of the flag
     * @return this node, to allow call chaining
     */
    public WordDataTreeNode end(boolean end) {
        this.end = end;
        return this;
    }

    /**
     * Looks up the child reached by the given character.
     *
     * @param c character to follow
     * @return the child node, or {@code null} when this node has no children
     *         or no child for {@code c}
     */
    public WordDataTreeNode getSubNode(final char c) {
        if (subNodeMap == null) {
            return null;
        }
        return subNodeMap.get(c);
    }

    /**
     * Registers (or replaces) the child reached by the given character.
     *
     * @param c       character leading to the child
     * @param subNode child node to store
     * @return this node (not the child), to allow call chaining
     */
    public WordDataTreeNode addSubNode(char c, WordDataTreeNode subNode) {
        if (this.subNodeMap == null) {
            // Allocate on first use so childless nodes do not hold an empty map.
            subNodeMap = new HashMap<>();
        }
        subNodeMap.put(c, subNode);
        return this;
    }
}
字典树实现+前缀表
/**
 * Sensitive-word dictionary implemented as a map-based trie with fallback
 * ("prefix") links.
 *
 * <p>{@link #initWordData(Collection)} builds the trie and its fallback links;
 * {@link #containsFindAll(StringBuilder)} then scans a text in a single pass,
 * following a node's fallback link whenever the next character cannot extend
 * the current path.
 *
 * <p>{@code initWordData} is {@code synchronized}; {@code containsFindAll} is not.
 */
@Data
public class WordDataTree implements Serializable {
    /**
     * Maximum number of lines taken from a word file.
     */
    private static final int MAX_WORD_COUNT = 5000;

    /**
     * Root node; {@code null} until {@link #initWordData(Collection)} has run.
     */
    private WordDataTreeNode root;

    /**
     * Size of the trie in bytes, measured as the length of the serialized root.
     */
    private Long treeSize;

    /**
     * Loads sensitive words from a UTF-8 text file (one word per line, at most
     * {@value #MAX_WORD_COUNT} lines) and rebuilds the trie from them.
     *
     * @param filePath path of the word file
     * @throws IOException if the file cannot be opened or read; the current
     *                     trie is left untouched in that case
     */
    public void readTxtFileWithInit(String filePath) throws IOException {
        Collection<String> words = new ArrayList<>();
        // try-with-resources closes the reader on every path, including read failures.
        try (BufferedReader reader =
                     new BufferedReader(new FileReader(filePath, StandardCharsets.UTF_8))) {
            String line;
            while (words.size() < MAX_WORD_COUNT && (line = reader.readLine()) != null) {
                // Each line is taken verbatim as one word.
                words.add(line);
            }
        }
        initWordData(words);
    }

    /**
     * Builds a fresh trie from the given words and swaps it in.
     *
     * <p>{@code null} or empty words are skipped. A {@code null} or empty
     * collection yields a trie without children, for which
     * {@link #containsFindAll(StringBuilder)} returns no matches.
     *
     * @param collection sensitive words to load; may be {@code null}
     */
    public synchronized void initWordData(Collection<String> collection) {
        WordDataTreeNode newRoot = new WordDataTreeNode();
        if (collection != null) {
            for (String word : collection) {
                if (word == null || word.isEmpty()) {
                    continue;
                }
                WordDataTreeNode current = newRoot;
                for (char c : word.toCharArray()) {
                    WordDataTreeNode child = current.getSubNode(c);
                    if (child == null) {
                        child = new WordDataTreeNode();
                        child.setFather(current);
                        child.setCharNow(c);
                        current.addSubNode(c, child);
                    }
                    current = child;
                }
                // Mark the last node of the word once, after the whole path exists.
                current.end(true);
            }
        }
        // Finish the fallback links before publishing the root, so a reader
        // never sees a trie whose links are still unset.
        initPrefix(newRoot);
        this.root = newRoot;
        this.treeSize = (long) SerializationUtils.serialize(newRoot).length;
    }

    /**
     * Computes the fallback link of every node, level by level.
     *
     * <p>A node's fallback is the node spelling the longest proper suffix of
     * its path that is also a path from the root. Breadth-first order matters:
     * a node's link is derived from its parent's link, which only ever points
     * to a shallower node, so every link that is read has already been set.
     *
     * @param trieRoot root of the trie to process
     */
    private void initPrefix(WordDataTreeNode trieRoot) {
        trieRoot.setPrefix(trieRoot);
        Queue<WordDataTreeNode> pending = new LinkedList<>();
        // First level: the only proper suffix is the empty path, i.e. the root.
        for (WordDataTreeNode child : getQueue(trieRoot)) {
            child.setPrefix(trieRoot);
            pending.add(child);
        }
        while (!pending.isEmpty()) {
            WordDataTreeNode parent = pending.poll();
            for (WordDataTreeNode child : getQueue(parent)) {
                char c = child.getCharNow();
                WordDataTreeNode fallback = parent.getPrefix();
                while (fallback != trieRoot && fallback.getSubNode(c) == null) {
                    fallback = fallback.getPrefix();
                }
                WordDataTreeNode target = fallback.getSubNode(c);
                child.setPrefix(target != null ? target : trieRoot);
                pending.add(child);
            }
        }
    }

    /**
     * Collects the direct children of a node.
     *
     * @param node node whose children are wanted
     * @return the children in map iteration order; empty when the node has none
     */
    private Queue<WordDataTreeNode> getQueue(WordDataTreeNode node) {
        Queue<WordDataTreeNode> queue = new LinkedList<>();
        Map<Character, WordDataTreeNode> children = node.getSubNodeMap();
        if (children != null) {
            queue.addAll(children.values());
        }
        return queue;
    }

    /**
     * Finds every occurrence of every loaded word in the given text.
     *
     * <p>Results are ordered by the position where the occurrence ends; words
     * ending at the same position are listed longest first. A word occurring
     * several times is reported once per occurrence. Repeated characters in
     * the text are not collapsed.
     *
     * @param stringBuilder text to scan; may be {@code null}
     * @return the matched words, or an empty list when the text is
     *         {@code null} or no words have been loaded
     */
    public List<String> containsFindAll(StringBuilder stringBuilder) {
        List<String> result = new ArrayList<>();
        // Work on a snapshot so a concurrent re-initialisation cannot mix two tries.
        WordDataTreeNode trieRoot = this.root;
        if (stringBuilder == null
                || trieRoot == null
                || trieRoot.getSubNodeMap() == null
                || trieRoot.getSubNodeMap().isEmpty()) {
            return result;
        }
        WordDataTreeNode current = trieRoot;
        int len = stringBuilder.length();
        for (int i = 0; i < len; i++) {
            char c = stringBuilder.charAt(i);
            // On a mismatch fall back to shorter and shorter suffixes of the current path.
            while (current != trieRoot && current.getSubNode(c) == null) {
                current = current.getPrefix();
                if (current == null) {
                    // A missing link (e.g. a root installed through the setter) means "restart".
                    current = trieRoot;
                }
            }
            WordDataTreeNode next = current.getSubNode(c);
            if (next != null) {
                current = next;
            }
            // Every node on the fallback chain is a suffix of the text read so
            // far, so each one flagged as a word end is an occurrence ending at i.
            for (WordDataTreeNode hit = current;
                 hit != null && hit != trieRoot;
                 hit = hit.getPrefix()) {
                if (hit.end()) {
                    // The word length is the node's depth, not a running counter,
                    // because fallbacks shorten the matched path.
                    result.add(stringBuilder.substring(i + 1 - depthOf(hit), i + 1));
                }
            }
        }
        return result;
    }

    /**
     * Counts the edges between a node and the root by following parent links.
     *
     * @param node node to measure
     * @return number of characters on the path from the root to {@code node}
     */
    private static int depthOf(WordDataTreeNode node) {
        int depth = 0;
        for (WordDataTreeNode n = node; n.getFather() != null; n = n.getFather()) {
            depth++;
        }
        return depth;
    }
}
有待完成的:
支持正则表达式匹配?