The DFA Algorithm
DFA stands for Deterministic Finite Automaton: for every state and input character there is exactly one transition, so matching never needs to backtrack.
The principle, in brief: the sensitive words are organized into a finite set of nodes, each carrying one of two states, "continue" or "end" (0 for continue, 1 for end).
Matching walks from one node to the next, character by character, until it reaches a node whose state is "end". Concretely, this is a trie (prefix tree) whose nodes carry an isEnd flag.
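Before the full implementation, here is a minimal sketch of that principle (the class and method names are hypothetical, for illustration only, and not part of the code below): each node is a state stored as a map of transitions, and matching is a walk through those maps until an "end" state is reached.
import java.util.HashMap;
import java.util.Map;

class TrieNode {
    boolean isEnd;                                   // 1 = end, 0 = continue
    Map<Character, TrieNode> next = new HashMap<>();
}

class TrieSketch {
    // Walk the automaton from position i; true once an "end" state is reached.
    static boolean matchesAt(TrieNode root, String text, int i) {
        TrieNode node = root;
        for (; i < text.length(); i++) {
            node = node.next.get(text.charAt(i));
            if (node == null) return false;          // no transition: give up
            if (node.isEnd) return true;             // a complete word was matched
        }
        return false;
    }
}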
Example
public static void main(String[] args) {
WordFilter filter = new WordFilter(Lists.newArrayList("哈哈","好日志","好儿子","上帝"));
String text = "利于上帝行上游发展的哈哈哈哈包夜政策逐渐发布";
System.out.println(filter.wordList(text));
System.out.println(filter.wordCount(text));
System.out.println(filter.replace(text));
}
Output:
[上帝, 哈哈]
4
利于**行上游发展的****包夜政策逐渐发布
Note that wordCount returns 4 even though only two distinct words were found: matching restarts at every character position, so 哈哈哈哈 contains three overlapping hits of 哈哈, plus one hit of 上帝.
Code
WordFilter: the sensitive-word filter
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;

import com.google.common.collect.Lists;

public class WordFilter {
private final WordNode wordNode;
/**
* Constructor: build the filter from an existing word context.
*/
public WordFilter(WordContext context) {
this.wordNode = context.getWordNode();
}
public WordFilter(List<String> blackWords) {
WordContext context = WordContext.getWordContext(blackWords);
this.wordNode = context.getWordNode();
}
/**
* Replace sensitive words with the default symbol '*'.
*
* @param text input text
*/
public String replace(final String text) {
return replace(text, 0, '*');
}
/**
* Replace sensitive words.
*
* @param text   input text
* @param symbol replacement character
*/
public String replace(final String text, final char symbol) {
return replace(text, 0, symbol);
}
/**
* Replace sensitive words.
*
* @param text   input text
* @param skip   maximum number of non-matching characters allowed between the characters of a word
* @param symbol replacement character
*/
public String replace(final String text, final int skip, final char symbol) {
char[] charset = text.toCharArray();
for (int i = 0; i < charset.length; i++) {
WordIndex wordIndex = getWordIndex(charset, i, skip);
if (wordIndex.isBlackWord()) {
for (int index : wordIndex.getIndex()) {
charset[index] = symbol;
}
}
}
return new String(charset);
}
/**
* Whether the text contains any sensitive word.
*
* @param text input text
*/
public boolean include(final String text) {
return include(text, 0);
}
/**
* Whether the text contains any sensitive word.
*
* @param text input text
* @param skip maximum gap allowed between matched characters
*/
public boolean include(final String text, final int skip) {
char[] charset = text.toCharArray();
for (int i = 0; i < charset.length; i++) {
WordIndex wordIndex = getWordIndex(charset, i, skip);
if (wordIndex.isBlackWord()) {
return true;
}
}
return false;
}
/**
* Count sensitive-word hits (overlapping matches are counted separately).
*
* @param text input text
*/
public int wordCount(final String text) {
return wordCount(text, 0);
}
/**
* Count sensitive-word hits (overlapping matches are counted separately).
*
* @param text input text
* @param skip maximum gap allowed between matched characters
*/
public int wordCount(final String text, final int skip) {
int count = 0;
char[] charset = text.toCharArray();
for (int i = 0; i < charset.length; i++) {
WordIndex wordIndex = getWordIndex(charset, i, skip);
if (wordIndex.isBlackWord()) {
count++;
}
}
return count;
}
/**
* Collect the distinct sensitive words found in the text.
*
* @param text input text
*/
public Set<String> wordList(final String text) {
return wordList(text, 0);
}
/**
* Collect the distinct sensitive words found in the text.
*
* @param text input text
* @param skip maximum gap allowed between matched characters
*/
public Set<String> wordList(final String text, final int skip) {
Set<String> wordList = new HashSet<>();
char[] charset = text.toCharArray();
for (int i = 0; i < charset.length; i++) {
WordIndex wordIndex = getWordIndex(charset, i, skip);
if (wordIndex.isBlackWord()) {
wordList.add(wordIndex.getWord());
}
}
return wordList;
}
/**
* Try to match a sensitive word starting at the given position. Matching is
* greedy: it follows the longest chain of trie transitions, tolerating up to
* {@code skip} non-matching characters between matched ones.
*
* @param charset input text as a char array
* @param begin   start index of the match attempt
* @param skip    maximum gap allowed between matched characters
*/
private WordIndex getWordIndex(final char[] charset, final int begin, final int skip) {
WordIndex wordIndex = new WordIndex();
WordNode current = wordNode;
// flag: whether the node chain matched so far ends at a complete word
boolean flag = false;
// count: consecutive non-matching characters since the last matched one
int count = 0;
List<Integer> index = new ArrayList<>();
StringBuilder stringBuilder = new StringBuilder();
for (int i = begin; i < charset.length; i++) {
char word = charset[i];
WordNode childNode = current.getChild(word);
// Stop when the gap budget is exhausted, or the first character has no transition in the trie
if (count > skip || (i == begin && Objects.isNull(childNode))) {
break;
}
if (Objects.nonNull(childNode)) {
current = childNode;
count = 0;
index.add(i);
stringBuilder.append(word);
} else {
count++;
if (flag && count > skip) {
break;
}
}
flag = current.isLast();
}
wordIndex.setBlackWord(flag);
wordIndex.setIndex(index);
wordIndex.setWord(stringBuilder.toString());
return wordIndex;
}
public static void main(String[] args) {
// Build the filter through the context so the word list can be modified later
WordContext context = WordContext.getWordContext(Lists.newArrayList("妓女", "妓女槽", "菊花洞", "包夜"));
WordFilter filter = new WordFilter(context);
String text = "利于妓女槽行业菊花上游发展的包夜政策逐渐发布";
System.out.println(filter.wordList(text));
System.out.println(filter.wordCount(text));
System.out.println(filter.replace(text));
//context.removeWord(Collections.singletonList("妓女"));
//System.out.println(filter.wordList(text));
}
}
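None of the examples above exercise the skip parameter, so here is a sketch of what it does (the demo class name and the sample words are made up for illustration): with skip = 1, up to one interfering character may sit between the characters of a blacklisted word and the word is still detected, and only the matched characters are masked.
import com.google.common.collect.Lists;

public class SkipDemo {
    public static void main(String[] args) {
        WordFilter filter = new WordFilter(Lists.newArrayList("上帝"));
        String text = "利于上x帝行上游发展";
        System.out.println(filter.include(text));           // false: with skip = 0 the 'x' breaks the match
        System.out.println(filter.include(text, 1));        // true: one interfering character is tolerated
        System.out.println(filter.replace(text, 1, '*'));   // 利于*x*行上游发展: only matched characters are masked
    }
}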
WordContext: the lexicon context
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class WordContext {
private static volatile WordContext wordContext = null;
private static volatile WordNode rootWordNode = null;
private WordContext() {
}
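/**
* Double-checked-locking singleton. Note: the blacklist argument is only
* consumed on the first call; later calls return the existing context and
* ignore it.
*/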
public static WordContext getWordContext(List<String> blackList) {
if (wordContext == null) {
synchronized (WordContext.class) {
if (wordContext == null) {
wordContext = new WordContext();
rootWordNode = new WordNode();
// Load the blacklist into the trie
addWord(blackList);
}
}
}
return wordContext;
}
public WordNode getWordNode() {
return rootWordNode;
}
/**
* Load the sensitive words and build the DFA model (a trie). For the words
* 中国, 中国人民, 中国男人 and 五星红旗 the structure looks like:
* 中 = { isEnd = 0,
*   国 = { isEnd = 1,
*     人 = { isEnd = 0, 民 = { isEnd = 1 } },
*     男 = { isEnd = 0, 人 = { isEnd = 1 } }
*   }
* }
* 五 = { isEnd = 0, 星 = { isEnd = 0, 红 = { isEnd = 0, 旗 = { isEnd = 1 } } } }
*/
public static void addWord(Iterable<String> wordList) {
WordNode wordNode;
// Iterate over the word list
for (String key : wordList) {
wordNode = rootWordNode;
for (int i = 0; i < key.length(); i++) {
char keyChar = key.charAt(i);
// Look up the child node for this character
WordNode childNode = wordNode.getChild(keyChar);
boolean isLast = i == key.length() - 1;
if (childNode == null) {
// No node for this character yet: create it
childNode = new WordNode(isLast);
wordNode.addChild(keyChar, childNode);
} else if (isLast) {
// The node already exists (e.g. "妓女槽" was added before "妓女"):
// mark it as a word ending, otherwise the shorter word is never matched
childNode.setLast(true);
}
wordNode = childNode;
}
}
}
public void removeWord(Iterable<String> wordList) {
for (String key : wordList) {
// Walk down the trie, recording the node for each character of the word
List<WordNode> path = new ArrayList<>();
WordNode wordNode = rootWordNode;
for (int i = 0; i < key.length(); i++) {
wordNode = wordNode.getChild(key.charAt(i));
if (wordNode == null) {
path = null;
break;
}
path.add(wordNode);
}
// Skip words that are not in the trie as complete entries
if (path == null || path.isEmpty() || !path.get(path.size() - 1).isLast()) {
continue;
}
// Unmark the word ending
path.get(path.size() - 1).setLast(false);
// Prune bottom-up: a node can be removed once it has no children and ends no other word
for (int j = path.size() - 1; j >= 0; j--) {
WordNode current = path.get(j);
if (current.childSize() > 0 || current.isLast()) {
break;
}
WordNode parent = j == 0 ? rootWordNode : path.get(j - 1);
parent.removeChild(key.charAt(j));
}
}
}
/**
* Read the lexicon file and collect one word per line into a set.
*/
private Set<String> readWordFile(String file) throws Exception {
Set<String> set = new HashSet<>();
// try-with-resources closes the streams for us
try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(
this.getClass().getResourceAsStream(file), StandardCharsets.UTF_8))) {
String txt;
while ((txt = bufferedReader.readLine()) != null) {
set.add(txt);
}
}
return set;
}
}
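Because the filter and the context share the same root node, pruning the trie takes effect immediately for every filter built on it. A short sketch of runtime removal (the demo class name is hypothetical; this assumes the corrected removeWord above):
import java.util.Collections;
import com.google.common.collect.Lists;

public class RemoveWordDemo {
    public static void main(String[] args) {
        WordContext context = WordContext.getWordContext(Lists.newArrayList("上帝", "哈哈"));
        WordFilter filter = new WordFilter(context);
        System.out.println(filter.include("利于上帝行事")); // true
        // Prune "上帝" from the shared trie; the filter sees the change at once,
        // since it holds a reference to the same root node.
        context.removeWord(Collections.singletonList("上帝"));
        System.out.println(filter.include("利于上帝行事")); // false
    }
}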
WordNode: the sensitive-word trie node
import lombok.Data;
import org.apache.commons.collections4.map.HashedMap;

@Data
public class WordNode extends HashedMap<Character, WordNode> {
/**
* Whether this node is the last character of a word
*/
private boolean isLast;
public WordNode() {
}
public WordNode(boolean isLast) {
this.isLast = isLast;
}
public void setLast(boolean isLast) {
this.isLast = isLast;
}
public WordNode addChild(Character key, WordNode wordNode) {
return put(key, wordNode);
}
public WordNode removeChild(Character key) {
return remove(key);
}
public WordNode getChild(Character key) {
return get(key);
}
public int childSize() {
return size();
}
@Override
public String toString() {
return "WordNode{" + "isLast=" + isLast +"," + super.toString() + '}';
}
}
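To make the node states concrete, here is a small hand-built example (the demo class name is made up) showing the two-state chain that WordContext.addWord produces for the single word 哈哈:
public class WordNodeDemo {
    public static void main(String[] args) {
        WordNode root = new WordNode();
        WordNode first = new WordNode(false);  // first '哈': continue state (isEnd = 0)
        WordNode second = new WordNode(true);  // second '哈': end state (isEnd = 1)
        root.addChild('哈', first);
        first.addChild('哈', second);
        System.out.println(root.getChild('哈').getChild('哈').isLast()); // true
    }
}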
WordIndex: the match-result holder
import java.util.List;

import lombok.Data;

@Data
public class WordIndex {
/**
* Whether a sensitive word was matched
*/
private boolean isBlackWord;
/**
* Indexes of the matched characters in the original text
*/
private List<Integer> index;
/**
* The matched sensitive word
*/
private String word;
}