基于DFA算法的敏感词过滤

526 阅读2分钟

DFA算法

DFA即Deterministic Finite Automaton,翻译过来就是确定性有限自动机。
简单原理就是:在一个有限的状态集合中,每个状态都带有两种标记之一——结束或继续(可以用0代表继续,1代表结束),
匹配时沿着状态不断转移到下一个状态,直到到达被标记为结束的状态为止。

实例

public static void main(String[] args) {

    // Sample text containing several of the demo black words.
    String text = "利于上帝行上游发展的哈哈哈哈包夜政策逐渐发布";

    // Build a filter over the demo black-word list.
    WordFilter filter = new WordFilter(Lists.newArrayList("哈哈", "好日志", "好儿子", "上帝"));

    System.out.println(filter.wordList(text));   // matched sensitive words
    System.out.println(filter.wordCount(text));  // number of matches
    System.out.println(filter.replace(text));    // text with matches masked
}

输出:

[上帝, 哈哈]
4
利于**行上游发展的****包夜政策逐渐发布

代码

WordFilter 敏感词过滤器

public class WordFilter {

    /** Root node of the shared sensitive-word trie (DFA). */
    private final WordNode wordNode;

    /**
     * Builds a filter on top of an existing word context.
     *
     * @param context shared context holding the sensitive-word trie
     */
    public WordFilter(WordContext context) {
        this.wordNode = context.getWordNode();
    }

    /**
     * Builds a filter from a black-word list.
     * NOTE(review): {@link WordContext} is a process-wide singleton, so all
     * filters share one trie — two filters never have independent word sets.
     *
     * @param blackWords sensitive words to load into the shared trie
     */
    public WordFilter(List<String> blackWords) {
        WordContext context = WordContext.getWordContext(blackWords);
        this.wordNode = context.getWordNode();
    }

    /**
     * Replaces every character of each matched sensitive word with '*'.
     *
     * @param text input text
     * @return masked text
     */
    public String replace(final String text) {
        return replace(text, 0, '*');
    }

    /**
     * Replaces every character of each matched sensitive word with {@code symbol}.
     *
     * @param text   input text
     * @param symbol replacement character
     * @return masked text
     */
    public String replace(final String text, final char symbol) {
        return replace(text, 0, symbol);
    }

    /**
     * Replaces every character of each matched sensitive word with {@code symbol}.
     *
     * @param text   input text
     * @param skip   maximum number of non-matching characters allowed between
     *               consecutive characters of a word
     * @param symbol replacement character
     * @return masked text
     */
    public String replace(final String text, final int skip, final char symbol) {
        char[] charset = text.toCharArray();
        for (int i = 0; i < charset.length; i++) {
            WordIndex wordIndex = getWordIndex(charset, i, skip);
            if (wordIndex.isBlackWord()) {
                // Only the indices of the confirmed word are masked; gap
                // characters admitted by `skip` are left untouched.
                for (int index : wordIndex.getIndex()) {
                    charset[index] = symbol;
                }
            }
        }
        return new String(charset);
    }

    /**
     * Returns whether the text contains any sensitive word.
     *
     * @param text input text
     */
    public boolean include(final String text) {
        return include(text, 0);
    }

    /**
     * Returns whether the text contains any sensitive word.
     *
     * @param text input text
     * @param skip maximum gap between consecutive characters of a word
     */
    public boolean include(final String text, final int skip) {
        char[] charset = text.toCharArray();
        for (int i = 0; i < charset.length; i++) {
            WordIndex wordIndex = getWordIndex(charset, i, skip);
            if (wordIndex.isBlackWord()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Counts sensitive-word matches. Overlapping matches are counted once per
     * starting position (e.g. "哈哈哈哈" with word "哈哈" counts 3).
     *
     * @param text input text
     */
    public int wordCount(final String text) {
        return wordCount(text, 0);
    }

    /**
     * Counts sensitive-word matches.
     *
     * @param text input text
     * @param skip maximum gap between consecutive characters of a word
     */
    public int wordCount(final String text, final int skip) {
        int count = 0;
        char[] charset = text.toCharArray();
        for (int i = 0; i < charset.length; i++) {
            WordIndex wordIndex = getWordIndex(charset, i, skip);
            if (wordIndex.isBlackWord()) {
                count++;
            }
        }
        return count;
    }

    /**
     * Returns the distinct sensitive words found in the text.
     *
     * @param text input text
     */
    public Set<String> wordList(final String text) {
        return wordList(text, 0);
    }

    /**
     * Returns the distinct sensitive words found in the text.
     *
     * @param text input text
     * @param skip maximum gap between consecutive characters of a word
     */
    public Set<String> wordList(final String text, final int skip) {
        Set<String> wordList = new HashSet<>();
        char[] charset = text.toCharArray();
        for (int i = 0; i < charset.length; i++) {
            WordIndex wordIndex = getWordIndex(charset, i, skip);
            if (wordIndex.isBlackWord()) {
                wordList.add(wordIndex.getWord());
            }
        }
        return wordList;
    }

    /**
     * Walks the trie from {@code begin} and reports the longest complete
     * sensitive word starting there (if any).
     *
     * @param charset input text as a char array
     * @param begin   index to start matching at
     * @param skip    maximum number of consecutive non-matching characters
     *                tolerated between characters of a word
     */
    private WordIndex getWordIndex(final char[] charset, final int begin, final int skip) {
        WordIndex wordIndex = new WordIndex();
        WordNode current = wordNode;
        int misses = 0;                       // consecutive unmatched characters
        int matchedLen = 0;                   // chars in the longest CONFIRMED word
        List<Integer> path = new ArrayList<>();
        StringBuilder matched = new StringBuilder();
        for (int i = begin; i < charset.length; i++) {
            WordNode childNode = current.getChild(charset[i]);
            if (childNode == null) {
                if (i == begin) {
                    break;                    // first character is not a word head
                }
                misses++;
                if (misses > skip) {
                    break;                    // gap exceeds the allowed skip distance
                }
                continue;
            }
            current = childNode;
            misses = 0;
            path.add(i);
            matched.append(charset[i]);
            if (current.isLast()) {
                // Remember the longest complete word seen so far. The previous
                // implementation recomputed a single boolean flag on every
                // iteration, so probing a longer word (e.g. "abcd") past an
                // already-complete shorter word ("ab") cleared the match and
                // "ab" in "abcx" went undetected. Keeping the confirmed length
                // fixes that and also stops gap characters leaking into the
                // reported word/index.
                matchedLen = path.size();
            }
        }

        wordIndex.setBlackWord(matchedLen > 0);
        wordIndex.setIndex(new ArrayList<>(path.subList(0, matchedLen)));
        wordIndex.setWord(matched.substring(0, matchedLen));
        return wordIndex;
    }

    public static void main(String[] args) {

        // Arrays.asList avoids the Guava dependency for the demo.
        WordFilter filter = new WordFilter(Arrays.asList("妓女", "妓女槽", "菊花洞", "包夜"));

        String text = "利于妓女槽行业菊花上游发展的包夜政策逐渐发布";
        System.out.println(filter.wordList(text));
        System.out.println(filter.wordCount(text));
        System.out.println(filter.replace(text));

        //context.removeWord(Collections.singletonList("妓女"));
        //System.out.println(filter.wordList(text));
    }
}

WordContext 词库上下文

public class WordContext {

    /** Process-wide singleton; every WordFilter shares this context. */
    private static volatile WordContext wordContext = null;
    /** Root of the shared sensitive-word trie. */
    private static volatile WordNode rootWordNode = null;

    private WordContext() {
    }

    /**
     * Returns the singleton context, merging {@code blackList} into the shared
     * trie. The previous version ignored the list on every call after the
     * first, so a second filter's words silently vanished; words are now
     * merged on each call, under the class lock because {@link #addWord}
     * mutates shared state.
     *
     * @param blackList sensitive words to merge into the trie
     */
    public static WordContext getWordContext(List<String> blackList) {
        synchronized (WordContext.class) {
            if (wordContext == null) {
                wordContext = new WordContext();
                rootWordNode = new WordNode();
            }
            addWord(blackList);
        }
        return wordContext;
    }

    /** Returns the root node of the shared trie. */
    public WordNode getWordNode() {
        return rootWordNode;
    }

    /**
     * Inserts each word into the trie character by character; the node of a
     * word's final character is marked {@code isLast}. E.g. after adding
     * "ab" and "abc": a -> b(isLast) -> c(isLast).
     */
    public static void addWord(Iterable<String> wordList) {
        for (String key : wordList) {
            WordNode wordNode = rootWordNode;
            for (int i = 0; i < key.length(); i++) {
                char keyChar = key.charAt(i);
                boolean isLast = i == key.length() - 1;
                WordNode childNode = wordNode.getChild(keyChar);
                if (childNode == null) {
                    childNode = new WordNode(isLast);
                    wordNode.addChild(keyChar, childNode);
                } else if (isLast) {
                    // Bug fix: the full path already existed (the new word is
                    // a prefix of a previously added word), so the terminal
                    // mark must be set explicitly or the shorter word — e.g.
                    // adding "abc" then "ab" — is never recognized.
                    childNode.setLast(true);
                }
                wordNode = childNode;
            }
        }
    }

    /**
     * Removes the given words from the trie, pruning nodes that no longer
     * lead to any word. Nodes shared with other words (branch points or
     * terminal marks) are preserved.
     */
    public void removeWord(Iterable<String> wordList) {
        for (String key : wordList) {
            // Each word is removed independently. The previous version
            // returned from the whole method when one word was absent,
            // silently skipping every remaining word in the list.
            removeSingleWord(key);
        }
    }

    /** Removes one word from the trie; no-op when the word is not present. */
    private void removeSingleWord(String key) {
        List<WordNode> cacheList = new ArrayList<>();
        WordNode wordNode = rootWordNode;
        for (int i = 0; i < key.length(); i++) {
            WordNode childNode = wordNode.getChild(key.charAt(i));
            if (childNode == null) {
                return; // word not in the trie
            }
            wordNode = childNode;
            cacheList.add(wordNode);
        }

        char[] keys = key.toCharArray();
        boolean cleanable = false;
        char lastChar = 0;
        // Walk back from the tail: unmark the terminal node, then delete
        // childless nodes until one is reached that still terminates or
        // branches into another word.
        for (int j = cacheList.size() - 1; j >= 0; j--) {
            WordNode node = cacheList.get(j);
            if (j == cacheList.size() - 1) {
                node.setLast(false);
                if (node.childSize() == 0) {
                    cleanable = true;
                    continue;
                }
            }
            if (cleanable) {
                if (node.isLast()) {
                    cleanable = false; // another word ends here; keep the node
                }
                node.removeChild(lastChar);
            }
            lastChar = keys[j];
        }

        if (cleanable) {
            rootWordNode.removeChild(lastChar);
        }
    }

    /**
     * Reads a word file from the classpath, one word per line, into a set.
     *
     * @param file classpath resource path of the word file (UTF-8)
     * @throws Exception if the resource is missing or cannot be read
     */
    private Set<String> readWordFile(String file) throws Exception {
        Set<String> set = new HashSet<>();
        InputStream in = this.getClass().getResourceAsStream(file);
        if (in == null) {
            // Previously this NPE'd inside the InputStreamReader constructor.
            throw new IllegalStateException("word file not found: " + file);
        }
        // Both readers join try-with-resources so the buffer is closed even
        // when reading fails (the BufferedReader used to be left unclosed).
        try (InputStreamReader read = new InputStreamReader(in, "UTF-8");
             BufferedReader bufferedReader = new BufferedReader(read)) {
            String txt;
            while ((txt = bufferedReader.readLine()) != null) {
                set.add(txt);
            }
        }
        return set;
    }
}

WordNode 敏感词组封装类

@Data
public class WordNode extends HashedMap<Character, WordNode> {

    // Marks this node as the final character of a complete sensitive word;
    // children map each next character to the following trie node.
    private boolean isLast;

    /** Creates an intermediate (non-terminal) node. */
    public WordNode() {
        this(false);
    }

    /** Creates a node, optionally marking it as a word terminal. */
    public WordNode(boolean isLast) {
        this.isLast = isLast;
    }

    /** Sets or clears the word-terminal mark. */
    public void setLast(boolean isLast) {
        this.isLast = isLast;
    }

    /** Links {@code wordNode} as the child reached via {@code key}. */
    public WordNode addChild(Character key, WordNode wordNode) {
        return put(key, wordNode);
    }

    /** Unlinks and returns the child reached via {@code key}, if any. */
    public WordNode removeChild(Character key) {
        return remove(key);
    }

    /** Returns the child reached via {@code key}, or null. */
    public WordNode getChild(Character key) {
        return get(key);
    }

    /** Number of direct children of this node. */
    public int childSize() {
        return size();
    }

    @Override
    public String toString() {
        return "WordNode{" + "isLast=" + isLast + "," + super.toString() + '}';
    }
}

WordIndex 敏感词标记类

@Data
public class WordIndex {

    /**
     * Whether a complete sensitive word was matched at the scanned position.
     */
    private boolean isBlackWord;
    /**
     * Indices (into the scanned char array) of the matched characters.
     */
    private List<Integer> index;

    /**
     * The matched sensitive word itself.
     */
    private String word;
}