Java实现敏感词过滤 —— 字典树+前缀表

88 阅读3分钟

介绍

本文使用字典树 + 前缀表实现敏感词过滤功能:字典树存储敏感词库;前缀表为每个节点记录匹配失败时应回退到的节点(作用类似 KMP 的 next 数组,即 AC 自动机中的失败指针),从而加速敏感词的匹配。

基本字典树节点

/**
 * A single node of the sensitive-word trie.
 *
 * <p>Every node except the root stands for one character of a keyword; the
 * path from the root down to a node spells a prefix of one or more keywords.
 * Field accessors are generated by the {@code @Getter}/{@code @Setter}
 * annotations; the hand-written {@link #end()} / {@link #end(boolean)} pair
 * offers a fluent alternative for the end flag.
 */
@Getter
@Setter
public class WordDataTreeNode implements Serializable{

    /**
     * Keyword end marker: {@code true} when the path from the root to this
     * node spells a complete keyword.
     */
    private boolean end;

    /**
     * Parent node.
     */
    WordDataTreeNode father;

    /**
     * Child nodes keyed by the next character (multi-way tree). Created
     * lazily, so it stays {@code null} until the first child is added.
     */
    private Map<Character, WordDataTreeNode> subNodeMap;

    /**
     * Fallback ("prefix") node that the matcher moves to when the next input
     * character has no matching child under this node.
     */
    WordDataTreeNode prefix;

    /**
     * Character represented by this node.
     */
    Character charNow;

    /**
     * @return {@code true} if a keyword ends at this node
     */
    public boolean end() {
        return end;
    }

    /**
     * Sets the keyword end marker.
     *
     * @param end whether a keyword ends at this node
     * @return this node, to allow call chaining
     */
    public WordDataTreeNode end(boolean end) {
        this.end = end;
        return this;
    }

    /**
     * Looks up the child reached by the given character.
     *
     * @param c the next character
     * @return the child node, or {@code null} when this node has no children
     *         or no child for {@code c}
     */
    public WordDataTreeNode getSubNode(final char c) {
        if(subNodeMap == null) {
            return null;
        }

        return subNodeMap.get(c);
    }

    /**
     * Registers (or replaces) the child reached by the given character,
     * creating the child map on first use.
     *
     * @param c       the character leading to the child
     * @param subNode the child node
     * @return this node, to allow call chaining
     */
    public WordDataTreeNode addSubNode(char c, WordDataTreeNode subNode) {
        if(this.subNodeMap == null) {
            // Diamond instead of a raw HashMap: keeps the assignment type-checked.
            subNodeMap = new HashMap<>();
        }

        subNodeMap.put(c, subNode);
        return this;
    }

}

字典树实现+前缀表

/**
 * Sensitive-word trie built on per-node child maps, with a fallback
 * ("prefix") link on every node so that text can be scanned in a single
 * left-to-right pass (the Aho-Corasick construction).
 *
 * <p>{@link #initWordData(Collection)} is synchronized and assigns the new
 * root only after the trie and all fallback links are complete.
 * {@link #containsFindAll(StringBuilder)} is not synchronized; it reads the
 * root reference once and works on that snapshot.
 */
@Data
public class WordDataTree implements Serializable{

    /**
     * Maximum number of lines read from a word file.
     */
    private static final int MAX_WORD_LINES = 5000;

    /**
     * Root node; {@code null} until the trie has been initialised.
     */
    private WordDataTreeNode root;

    /**
     * Size of the trie in bytes, measured as the length of the serialized
     * root (which reaches every node).
     */
    private Long treeSize;

    /**
     * Reads a UTF-8 text file (one word per line, at most
     * {@value #MAX_WORD_LINES} lines) and rebuilds the trie from it.
     *
     * @param filePath path of the word file
     * @throws IOException if the file cannot be opened or read; in that case
     *                     the current trie is left untouched
     */
    public void readTxtFileWithInit(String filePath) throws IOException {
        Collection<String> words = new ArrayList<>();
        // try-with-resources: the reader is closed on every path.
        try (BufferedReader reader = new BufferedReader(new FileReader(filePath, StandardCharsets.UTF_8))) {
            String line;
            while (words.size() < MAX_WORD_LINES && (line = reader.readLine()) != null) {
                // Each line is taken verbatim as one word.
                words.add(line);
            }
        }
        initWordData(words);
    }


    /**
     * Rebuilds the trie and its fallback links from the given words.
     * {@code null} and empty words are skipped; a {@code null} or empty
     * collection yields an empty trie.
     *
     * @param collection the sensitive words
     */
    public synchronized void initWordData(Collection<String> collection) {
        WordDataTreeNode newRoot = new WordDataTreeNode();

        if (collection != null) {
            for (String word : collection) {
                if (word == null || word.isEmpty()) {
                    continue;
                }

                WordDataTreeNode current = newRoot;
                for (char c : word.toCharArray()) {
                    WordDataTreeNode child = current.getSubNode(c);
                    if (child == null) {
                        child = new WordDataTreeNode();
                        child.setFather(current);
                        child.setCharNow(c);
                        current.addSubNode(c, child);
                    }
                    current = child;
                }

                // The node reached by the last character marks the end of the word.
                current.end(true);
            }
        }

        newRoot.setPrefix(newRoot);
        initPrefix(newRoot);

        // Publish only after the fallback links are complete, so a concurrent
        // matcher never walks a half-initialised trie.
        this.root = newRoot;

        // One serialization of the root covers the whole trie.
        // NOTE(review): Java serialization recurses through the node graph;
        // confirm the stack depth is acceptable for very large word lists.
        this.treeSize = (long) SerializationUtils.serialize(newRoot).length;
    }

    /**
     * Computes the fallback link of every node, level by level.
     *
     * <p>Breadth-first order is required: a node's link is derived from the
     * links of strictly shallower nodes, possibly in other branches, so those
     * must already be final when the node is processed.
     *
     * @param trieRoot root of the trie being built
     */
    private void initPrefix(WordDataTreeNode trieRoot) {
        Map<Character, WordDataTreeNode> rootChildren = trieRoot.getSubNodeMap();
        if (rootChildren == null) {
            return;
        }

        Queue<WordDataTreeNode> pending = new LinkedList<>();
        // Depth-1 nodes always fall back to the root.
        for (WordDataTreeNode child : rootChildren.values()) {
            child.setPrefix(trieRoot);
            pending.add(child);
        }

        while (!pending.isEmpty()) {
            WordDataTreeNode parent = pending.poll();
            Map<Character, WordDataTreeNode> children = parent.getSubNodeMap();
            if (children == null) {
                continue;
            }

            for (Map.Entry<Character, WordDataTreeNode> entry : children.entrySet()) {
                char c = entry.getKey();
                WordDataTreeNode child = entry.getValue();

                // Follow the parent's fallback chain until some node can be
                // extended by c, or the root is reached.
                WordDataTreeNode fallback = parent.getPrefix();
                while (fallback != trieRoot && fallback.getSubNode(c) == null) {
                    fallback = fallback.getPrefix();
                }

                WordDataTreeNode target = fallback.getSubNode(c);
                child.setPrefix(target != null ? target : trieRoot);
                pending.add(child);
            }
        }
    }

    /**
     * Finds every occurrence of every sensitive word in the given text,
     * including overlapping occurrences and words contained in longer words.
     * Hits are listed in order of their end position; hits ending at the same
     * position are listed longest first. A word that occurs several times is
     * listed several times. Consecutive duplicate characters in the text are
     * not collapsed.
     *
     * @param stringBuilder the text to scan
     * @return the matched words; empty when the text is {@code null} or the
     *         trie is uninitialised or empty
     */
    public List<String> containsFindAll(StringBuilder stringBuilder) {
        List<String> result = new ArrayList<>();

        // Work on one snapshot of the root for the whole scan.
        WordDataTreeNode trieRoot = this.root;
        if (stringBuilder == null
                || trieRoot == null
                || trieRoot.getSubNodeMap() == null
                || trieRoot.getSubNodeMap().isEmpty()) {
            return result;
        }

        WordDataTreeNode nowNode = trieRoot;
        int len = stringBuilder.length();

        for (int i = 0; i < len; i++) {
            char c = stringBuilder.charAt(i);

            // On a mismatch, fall back until some node can consume c.
            while (nowNode != trieRoot && nowNode.getSubNode(c) == null) {
                nowNode = nowNode.getPrefix();
                if (nowNode == null) {
                    nowNode = trieRoot;
                }
            }

            WordDataTreeNode next = nowNode.getSubNode(c);
            if (next != null) {
                nowNode = next;
            }

            // Every node on the fallback chain is a suffix of the text read
            // so far; each one flagged as a word end is a hit ending at i.
            for (WordDataTreeNode hit = nowNode; hit != null && hit != trieRoot; hit = hit.getPrefix()) {
                if (hit.end()) {
                    // The word length is the node's depth, which stays correct
                    // after fallbacks and after earlier hits.
                    int wordLen = depthOf(hit, trieRoot);
                    result.add(stringBuilder.substring(i - wordLen + 1, i + 1));
                }
            }
        }

        return result;
    }

    /**
     * Counts the edges between a node and the root by walking parent links.
     *
     * @param node     the node to measure
     * @param trieRoot root of the trie the node belongs to
     * @return the depth of {@code node}, i.e. the length of the prefix it spells
     */
    private int depthOf(WordDataTreeNode node, WordDataTreeNode trieRoot) {
        int depth = 0;
        for (WordDataTreeNode cur = node; cur != null && cur != trieRoot; cur = cur.getFather()) {
            depth++;
        }
        return depth;
    }

}

有待完成的:

支持正则表达式匹配?