用前缀树实现中文敏感词过滤器

124 阅读2分钟

前言

本文代码实现一个中文的敏感词过滤器,预先将准备好的敏感词写入前缀树数据结构中实现快速检索,并且节省内存。一般用于检查注册用户名称、言论是否包含不文明的词汇。

可以判断内容是否包含敏感词;找出内容中的敏感词;将内容中的敏感词替换成设置的字符。

运行环境

代码使用了JDK8语法,以及测试框架Jupiter。以下是Maven配置:

<properties>
    <java.version>1.8</java.version>
    <maven.compiler.source>${java.version}</maven.compiler.source>
    <maven.compiler.target>${java.version}</maven.compiler.target>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties><dependencies>
    <dependency>
        <groupId>org.junit.jupiter</groupId>
        <artifactId>junit-jupiter</artifactId>
        <version>RELEASE</version>
        <scope>test</scope>
    </dependency>
</dependencies>

过滤器源码

import java.util.*;
import java.util.function.Predicate;
​
/**
 * 敏感词过滤器,限中文
 */
public class SensitiveWordFilters {
​
  /**
   * 如词典中有敏感词:[敏感, 敏感词]
   * true
   * ├── 匹配到[敏感]结束匹配
   * └── 比较省时,作简单判断用
   * false
   * ├── 匹配到[敏感词]才结束匹配
   * └── 相对费时,但是在替换敏感词的时候,能够替换掉更多匹配数据
   */
  private static final boolean SIMPLE_MATCH = false;
​
  /**
   * 忽略字符列表
   */
  private static final List<Character> IGNORE_CHAR_LIST = ignoreCharListInit();
​
  /**
   * 忽略部分字符
   * 如词典中有敏感词:[敏感词],现验证文本[敏 感 词],也会认定为敏感词,因为忽略了空格符
   * 同样在 重构字典、往字典中加敏感词时也会使用此断言
   */
  private static final Predicate<Character> CHAR_IGNORE =
      character -> Character.isSpaceChar(character) || IGNORE_CHAR_LIST.contains(character);
​
  /**
   * 重构字典
   */
  public static void refactoringBy(List<String> sensitiveWordList) {
    refactor(sensitiveWordList);
  }
​
  /**
   * 往字典中加敏感词
   */
  public static void add(List<String> sensitiveWordList) {
    sensitiveWordList.forEach(word -> recordToThe(SensitiveWordCache.dictionary, word));
  }
​
  /**
   * 往字典中加敏感词
   */
  public static void add(String sensitiveWord) {
    recordToThe(SensitiveWordCache.dictionary, sensitiveWord);
  }
​
  /**
   * true:text 中有敏感词
   */
  public static boolean foundIn(String text) {
    if (isEmpty(text)) {
      return false;
    }
​
    for (int i = 0; i < text.length(); i++) {
      if (checkSensitiveWord(text, i) > 0) {
        return true;
      }
    }
​
    return false;
  }
​
  /**
   * 从 text 中找出敏感词
   */
  public static Set<String> findOutFrom(String text) {
    if (isEmpty(text)) {
      return Collections.emptySet();
    }
​
    Set<String> resultSet = new TreeSet<>((o1, o2) -> o1.length() == o2.length() ? o1.compareTo(o2) : o2.length() - o1.length());
    for (int i = 0; i < text.length(); i++) {
      int endIndex = checkSensitiveWord(text, i);
      if (endIndex > 0) {
        resultSet.add(text.substring(i, ++endIndex));
      }
    }
​
    return resultSet;
  }
​
  /**
   * 替换 text 中的敏感词,每个字符换一个替换符
   *
   * @param text        文本
   * @param replaceChar 替换符
   * @return 替换后的文本
   */
  public static String replace(String text, String replaceChar) {
    Set<String> sensitiveWordSet = findOutFrom(text);
    if (sensitiveWordSet.isEmpty()) {
      return text;
    }
​
    for (String sensitiveWord : sensitiveWordSet) {
      text = text.replace(sensitiveWord, replacementOf(replaceChar, sensitiveWord.length()));
    }
    return text;
  }
​
  /**
   * 字典缓存
   */
  private static class SensitiveWordCache {
​
    /**
     * 字典/字典根节点
     */
    static Node dictionary;
​
    static {
      dictionary = new Node();
      dictionary.children = new HashMap<>(16);
    }
​
    private SensitiveWordCache() {
    }
  }
​
  /**
   * 重构字典
   *
   * @param sensitiveWordList 敏感字符列表
   */
  private static void refactor(List<String> sensitiveWordList) {
    Node newDictionary = new Node();
    newDictionary.children = new HashMap<>(16);
    synchronized (SensitiveWordCache.class) {
      for (String word : sensitiveWordList) {
        recordToThe(newDictionary, word);
      }
      SensitiveWordCache.dictionary = newDictionary;
    }
  }
​
  /**
   * 将敏感字符记录在节点上
   *
   * @param node 节点
   * @param word 敏感字符
   */
  private static void recordToThe(Node node, String word) {
    Objects.requireNonNull(node);
    synchronized (SensitiveWordCache.class) {
      for (int i = 0, lastIndex = word.length() - 1; i < word.length(); i++) {
        Character key = word.charAt(i);
​
        if (!CHAR_IGNORE.test(key)) {
          // 放置子节点
          Node next = node.get(key);
          if (Objects.isNull(next)) {
            next = new Node();
            node.putChild(key, next);
          }
          node = next;
        }
​
        if (i == lastIndex) {
          node.isEnd = true;
        }
      }
    }
  }
​
  /**
   * 从 startIndex 开始匹配敏感字符
   *
   * @param text       文本
   * @param startIndex 文本起始位置
   * @return 0-没有敏感字符,>0 敏感字符终止位置
   */
  private static int checkSensitiveWord(String text, int startIndex) {
    int endIndex = 0;
    Node node = SensitiveWordCache.dictionary;
    for (int i = startIndex; i < text.length(); i++) {
      Character key = text.charAt(i);
​
      if (CHAR_IGNORE.test(key)) {
        continue;
      }
​
      node = node.get(key);
      if (Objects.isNull(node)) {
        break;
      }
​
      if (node.isEnd) {
        endIndex = i;
        if (SIMPLE_MATCH) {
          break;
        }
      }
    }
​
    return endIndex;
  }
​
  private static boolean isEmpty(String str) {
    return str == null || "".equals(str);
  }
​
  /**
   * 生成完整的替换符
   *
   * @param replaceChar 单字符替换符
   * @param num         替换数量
   * @return 完整替换符
   */
  private static String replacementOf(String replaceChar, int num) {
    int minJointLength = 2;
    if (num < minJointLength) {
      return replaceChar;
    }
​
    StringBuilder replacement = new StringBuilder();
    for (int i = 0; i < num; i++) {
      replacement.append(replaceChar);
    }
    return replacement.toString();
  }
​
  /**
   * 字典数据节点
   */
  private static class Node {
    /**
     * true:敏感词结尾
     */
    boolean isEnd;
​
    /**
     * 子节点列表
     */
    Map<Character, Node> children;
​
    Node get(Character key) {
      return Objects.nonNull(children) ? children.get(key) : null;
    }
​
    void putChild(Character key, Node node) {
      if (Objects.isNull(children)) {
        children = new HashMap<>(16);
      }
      children.put(key, node);
    }
  }
​
  /**
   * 初始化忽略字符列表
   */
  private static List<Character> ignoreCharListInit() {
    List<Character> ignoreCharList = new ArrayList<>(10);
    ignoreCharList.add('|');
    ignoreCharList.add('-');
    return Collections.unmodifiableList(ignoreCharList);
  }
​
  private SensitiveWordFilters() {
  }
}

过滤器测试类

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
​
import java.util.Arrays;
​
class SensitiveWordFiltersTest {
​
  /**
   * 重构字典
   */
  @Test
  void refactoringBy() {
    SensitiveWordFilters.refactoringBy(Arrays.asList(getSensitiveWords()));
  }
​
  /**
   * 往字典中加敏感词
   */
  @Test
  void add() {
    SensitiveWordFilters.add(Arrays.asList("敏感词"));
  }
​
  /**
   * 判断内容是否包含敏感词
   */
  @Test
  void foundIn() {
    SensitiveWordFilters.refactoringBy(Arrays.asList(getSensitiveWords()));
    Assertions.assertTrue(SensitiveWordFilters.foundIn("白银混蛋"));
  }
​
  /**
   * 从内容中找出敏感词
   */
  @Test
  void findOutFrom() {
    SensitiveWordFilters.refactoringBy(Arrays.asList(getSensitiveWords()));
    System.out.println(SensitiveWordFilters.findOutFrom("白银混蛋"));
  }
​
  /**
   * 替换内容中的敏感词
   */
  @Test
  void replace() {
    SensitiveWordFilters.refactoringBy(Arrays.asList(getSensitiveWords()));
    String string = "就算是一个 顶-级 高 手,也会被那个白银 混蛋坑得很惨";
    System.out.println(SensitiveWordFilters.replace(string, "*"));
  }
​
  private static String[] getSensitiveWords() {
    return sensitiveWords.split("\|");
  }
​
  static final String sensitiveWords = "顶级|白银|混蛋";
}