阅读 1281

基于PHP + TRIE树实现敏感词过滤算法

  • 公司新项目的素材编辑功能需要支持敏感词过滤。上网查了一下,大多数实现都基于 Trie 树算法,但用 PHP 编写的很少,已有的实现中部分还存在 bug。因此,我在他人实现的基础上进行了完善。

敏感词过滤算法实现


/**
 * A single node of the sensitive-word trie.
 */
class TreeMap
{
    /**
     * Character stored on this node.
     *
     * @var mixed
     */
    public $data;

    /**
     * Child nodes. A node may have any number of children, so they are
     * kept in an array.
     *
     * @var array
     */
    public $children = [];

    /**
     * Whether a word ends at this node.
     *
     * @var bool
     */
    public $isEndingChar = false;

    /**
     * @param mixed $data Character to store on the node.
     */
    public function __construct($data)
    {
        $this->data = $data;
    }
}

/**
 * Sensitive-word filter backed by a trie of TreeMap nodes.
 *
 * Words are inserted character by character (multibyte-aware); searching
 * walks the text and reports every dictionary word it encounters.
 */
class TrieTree
{
    /**
     * Root node of the trie. Its own character ('/') is a placeholder and
     * is never compared against the text.
     *
     * @var TreeMap
     * @author qpf
     */
    public $trieTreeMap;

    public function __construct()
    {
        $this->trieTreeMap = new TreeMap('/');
    }

    /**
     * Return the root node of the trie.
     *
     * @return TreeMap
     * @author qpf
     */
    public function getTreeMap()
    {
        return $this->trieTreeMap;
    }

    /**
     * Add sensitive words to the trie.
     *
     * Empty strings are ignored: inserting one would otherwise flag the
     * root node itself as the end of a word.
     *
     * @param array $wordsList List of words to register.
     * @author qpf
     */
    public function addWords(array $wordsList)
    {
        foreach ($wordsList as $words) {
            $len = mb_strlen($words);
            if ($len === 0) {
                continue;
            }

            // Walk down from the root, creating missing nodes on the way.
            $node = $this->trieTreeMap;
            for ($i = 0; $i < $len; $i++) {
                $char = mb_substr($words, $i, 1);
                if (!isset($node->children[$char])) {
                    $node->children[$char] = new TreeMap($char);
                }
                $node = $node->children[$char];
            }

            // The node reached by the last character terminates a word.
            $node->isEndingChar = true;
        }
    }

    /**
     * Find the sensitive words contained in a text.
     *
     * Hits are returned in order of appearance and do not overlap: after a
     * hit, scanning resumes right behind it. Matching is shortest-first, so
     * when one registered word is a prefix of another (e.g. "白粉" and
     * "白粉人") only the shorter one is ever reported.
     *
     * @param string $txt Text to scan.
     * @return array List of matched words (empty when nothing matches).
     * @author qpf
     */
    public function search($txt)
    {
        $wordsList = array();
        $txtLength = mb_strlen($txt);
        for ($i = 0; $i < $txtLength; $i++) {
            $wordLength = $this->checkWord($txt, $i, $txtLength);
            if ($wordLength > 0) {
                $wordsList[] = mb_substr($txt, $i, $wordLength);
                // Skip the matched characters; the loop's $i++ adds the last step.
                $i += $wordLength - 1;
            }
        }
        return $wordsList;
    }

    /**
     * Check whether a sensitive word starts at the given position.
     *
     * @param string $txt        Text being scanned.
     * @param int    $beginIndex Character offset at which to start matching.
     * @param int    $length     Length of $txt in characters.
     * @return int Length (in characters) of the word found at $beginIndex,
     *             or 0 when no complete word starts there.
     */
    private function checkWord($txt, $beginIndex, $length)
    {
        $wordLength = 0;
        $node = $this->trieTreeMap;
        for ($i = $beginIndex; $i < $length; $i++) {
            $char = mb_substr($txt, $i, 1);
            if (!isset($node->children[$char])) {
                // Path left the trie before reaching a word end: no match.
                return 0;
            }
            $wordLength++;
            $node = $node->children[$char];
            if ($node->isEndingChar === true) {
                return $wordLength;
            }
        }

        // Text ended in the middle of a word. A partial prefix is never a
        // match, regardless of where in the text it started.
        return 0;
    }
}

// Demo: register a few words, scan a sample sentence and dump the hits.
$data = [
    '白粉',
    '白粉人',
    '白粉人嫩',
    '不该大',
];

$wordObj = new TrieTree();
$wordObj->addWords($data);

$txt = "白粉啊,白粉人,我不该大啊";
$words = $wordObj->search($txt);

var_dump($words);
exit;

复制代码