违禁词过滤器 DFA算法 升级版

471 阅读2分钟

敏感词/屏蔽字过滤器(DFA 算法升级版):新增严格模式,可强力去除被其他字符隔开的敏感词。

该算法经测试,在 13993 个敏感词的词库中过滤 26 个字符的文字,耗时不足 1 ms(以 System.currentTimeMillis() 计时,显示为 0 ms)。

Snipaste_2021-10-29_17-43-59.png

 
package test1;
 
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
 
/**
 * One node of the sensitive-word trie.
 *
 * <p>Each node stands for one symbol of a word; the path from the root down to a node
 * spells a prefix of at least one registered word.
 */
class WordNode {
    /** True when the path from the root to this node spells a complete sensitive word. */
    public boolean isEnd;
    /** Node this one was attached to by {@link #addChild(String)}; {@code null} otherwise. */
    public WordNode parentNode;
    /** Child nodes, keyed by the symbol that leads to them. */
    public Map<String, WordNode> children = new HashMap<>();
    /** Symbol this node was registered under; stays empty for a node not created by addChild. */
    public String value = "";

    /**
     * Looks up the child reached by the given symbol.
     *
     * @param name symbol to follow
     * @return the child node, or {@code null} when there is no such child
     */
    public WordNode getChild(String name) {
        return this.children.get(name);
    }

    /**
     * Creates a fresh child for the given symbol and attaches it to this node.
     * Any child previously stored under the same symbol is replaced.
     *
     * @param name symbol the new child stands for
     * @return the newly created child
     */
    public WordNode addChild(String name) {
        WordNode node = new WordNode();
        node.parentNode = this;
        // Record the symbol on the node itself so the trie can be walked upwards and printed.
        node.value = name;
        this.children.put(name, node);
        return node;
    }
}

/**
 * Trie-based ("DFA") sensitive-word filter.
 *
 * <p>Words are registered with {@link #addWords(String[])}. Text is then scanned either for
 * contiguous occurrences only, or additionally (strict mode) for words whose characters
 * appear in order but are separated by other characters.
 *
 * <p>Matching works on UTF-16 {@code char} units, so a masked supplementary character is
 * replaced once per code unit.
 */
public class DFAWords {
    /** Root of the word trie; it stands for the empty prefix. */
    public WordNode treeRoot = new WordNode();

    /**
     * Adds sensitive words to the trie.
     *
     * @param words words to register; a {@code null} array and {@code null} or empty
     *              entries are ignored
     */
    public void addWords(String[] words) {
        if (words == null) {
            return;
        }
        for (String word : words) {
            if (word == null || word.isEmpty()) {
                continue;
            }
            WordNode node = this.treeRoot;
            for (String symbol : toSymbols(word)) {
                WordNode next = node.getChild(symbol);
                node = (next != null) ? next : node.addChild(symbol);
            }
            node.isEnd = true;
        }
    }

    /**
     * Finds the positions of all characters that belong to a sensitive word.
     *
     * @param word   text to scan; {@code null} or empty yields an empty result
     * @param strict when true, words whose characters are scattered through the text
     *               (in order, but not adjacent) are matched as well
     * @return char indexes into {@code word} that should be masked, ascending and
     *         without duplicates
     */
    public List<Integer> searchWords(String word, boolean strict) {
        List<Integer> indexes = new ArrayList<>();
        boolean[] hits = findHits(word, strict);
        for (int i = 0; i < hits.length; i++) {
            if (hits[i]) {
                indexes.add(i);
            }
        }
        return indexes;
    }

    /**
     * Masks sensitive words with {@code "*"} using strict mode.
     *
     * @param word text to filter
     * @return the filtered text
     */
    public String filter(String word) {
        return this.filter(word, "*", true);
    }

    /**
     * Masks sensitive words with {@code "*"}.
     *
     * @param word   text to filter
     * @param strict see {@link #searchWords(String, boolean)}
     * @return the filtered text
     */
    public String filter(String word, boolean strict) {
        return this.filter(word, "*", strict);
    }

    /**
     * Replaces every character that belongs to a sensitive word with {@code repStr}.
     *
     * @param word   text to filter; {@code null} is returned unchanged
     * @param repStr replacement for each masked character; may be empty or longer than one
     *               character
     * @param strict see {@link #searchWords(String, boolean)}
     * @return the filtered text
     * @throws NullPointerException if {@code repStr} is {@code null}
     */
    public String filter(String word, String repStr, boolean strict) {
        if (repStr == null) {
            throw new NullPointerException("repStr must not be null");
        }
        if (word == null) {
            return null;
        }
        boolean[] hits = findHits(word, strict);
        if (hits.length == 0) {
            return word;
        }
        // Build the result left to right instead of replacing inside a buffer: in-place
        // replacement shifts later indexes as soon as repStr is not exactly one char long.
        StringBuilder out = new StringBuilder(word.length());
        for (int i = 0; i < hits.length; i++) {
            if (hits[i]) {
                out.append(repStr);
            } else {
                out.append(word.charAt(i));
            }
        }
        return out.toString();
    }

    /** Returns one flag per char of {@code word}; true marks a char that must be masked. */
    private boolean[] findHits(String word, boolean strict) {
        if (word == null || word.isEmpty() || this.treeRoot == null) {
            return new boolean[0];
        }
        String[] symbols = toSymbols(word);
        boolean[] hits = new boolean[symbols.length];
        markContiguous(symbols, hits);
        if (strict) {
            markScattered(symbols, hits);
        }
        return hits;
    }

    /** Marks every contiguous occurrence, taking the longest word from each start position. */
    private void markContiguous(String[] symbols, boolean[] hits) {
        for (int start = 0; start < symbols.length; start++) {
            WordNode node = this.treeRoot;
            int lastEnd = -1;
            for (int i = start; i < symbols.length; i++) {
                node = node.getChild(symbols[i]);
                if (node == null) {
                    break;
                }
                if (node.isEnd) {
                    lastEnd = i;
                }
            }
            // Trying every start position (rather than resuming at the mismatch) is what
            // finds "aab" inside "aaab" and overlapping words such as "ab"/"bc" in "abc".
            for (int i = start; i <= lastEnd; i++) {
                hits[i] = true;
            }
        }
    }

    /** Marks words whose characters occur in order among the still-unmatched positions. */
    private void markScattered(String[] symbols, boolean[] hits) {
        // Positions that started a word which could not be completed; skipped from then on.
        boolean[] deadEnd = new boolean[symbols.length];
        List<Integer> path = new ArrayList<>();
        boolean progressed;
        do {
            progressed = false;
            path.clear();
            WordNode node = this.treeRoot;
            for (int i = 0; i < symbols.length; i++) {
                if (hits[i] || deadEnd[i]) {
                    continue;
                }
                WordNode next = node.getChild(symbols[i]);
                if (next == null) {
                    continue;
                }
                node = next;
                path.add(i);
                progressed = true;
                if (next.isEnd) {
                    for (int index : path) {
                        hits[index] = true;
                    }
                    path.clear();
                    node = this.treeRoot;
                }
            }
            // Whatever is left on the path is a prefix that never reached a word end.
            for (int index : path) {
                deadEnd[index] = true;
            }
            // Every productive sweep retires at least one position, so this terminates.
        } while (progressed);
    }

    /** Splits text into one-character strings, index-aligned with {@code String.charAt}. */
    private static String[] toSymbols(String text) {
        String[] symbols = new String[text.length()];
        for (int i = 0; i < symbols.length; i++) {
            symbols[i] = String.valueOf(text.charAt(i));
        }
        return symbols;
    }

    /**
     * Demo: loads a {@code |}-separated word list and filters a sample sentence in both modes.
     *
     * @param args optional path of the word list; defaults to {@code D:\mgck2017\key.txt}
     */
    public static void main(String[] args) {
        String dictionaryPath = args.length > 0 ? args[0] : "D:\\mgck2017\\key.txt";
        try {
            String str = "你特风台八好你路湾台湾是花码热人死了以后再通知我们吧好";
            String textString = new String(Files.readAllBytes(Paths.get(dictionaryPath)), "UTF-8");
            String[] words = textString.split("\\|");
            DFAWords dfa = new DFAWords();
            long time = System.currentTimeMillis();
            dfa.addWords(words);
            dfa.addWords(new String[] { "你好" });
            System.out.println(String.format("建树耗时:%dms, 敏感词库:%d个词", System.currentTimeMillis() - time, words.length));
            time = System.currentTimeMillis();
            System.out.println(String.format("敏感词:你好|台湾|风花|特码|人死了以后再通知我们吧\n过滤文字:%s", str));
            System.out.println(String.format("强力过滤【%s】耗时:%dms", dfa.filter(str), System.currentTimeMillis() - time));
            time = System.currentTimeMillis();
            System.out.println(String.format("非强力过滤【%s】耗时:%dms", dfa.filter(str, false), System.currentTimeMillis() - time));
        } catch (IOException e) {
            // Also covers UnsupportedEncodingException, which is an IOException subclass.
            System.err.println("Cannot read word list " + dictionaryPath + ": " + e);
        }
    }

}