很多评论需要做敏感词的过滤和屏蔽;
这里讲一下方法。在数据库里面建一个敏感词库表,一行记录存一个敏感词。
然后采用正则匹配的方法过滤,再加上单词划分法,两种办法同时进行。
过滤部分的算法:
单词划分法,是把敏感词拆成单个字符,每个字符作为节点树(字典树)的一个节点,逐字进行匹配:
/**
 * One node of the sensitive-word tree.
 *
 * Per the original author's note each node stands for a single character;
 * children are stored in an array keyed by the string passed as $word.
 */
class WordsNode
{
    /** @var bool True when this node is the last node of a banned word. */
    public $end = false;

    /** @var WordsNode[] Child nodes, keyed by the string they were created for. */
    protected $child = [];

    /**
     * Return the child stored under $word, creating and storing it first
     * when it does not exist yet.
     *
     * @param string $word
     * @return WordsNode
     */
    public function getChildAlways($word)
    {
        if (isset($this->child[$word])) {
            return $this->child[$word];
        }

        $node = new self();
        $this->child[$word] = $node;

        return $node;
    }

    /**
     * Look up the child stored under $word without creating anything.
     *
     * @param string $word
     * @return WordsNode|null Null for the empty string or when no such child exists.
     */
    public function getChild($word)
    {
        // The empty string is never resolved, even if a child was stored under it.
        if ($word === '' || !isset($this->child[$word])) {
            return null;
        }

        return $this->child[$word];
    }
}
然后是匹配算法,这个算法里面有正则匹配部分和节点树匹配部分:
/**
 * Sensitive-word detector.
 *
 * Holds the banned words in two forms built by the constructor:
 *  - a list of regex-quoted words, matched in chunks by preg_check();
 *  - a character tree of WordsNode objects, walked by tree_check().
 *
 * check() runs the regex pass first and falls back to the tree pass when the
 * regex pass finds nothing.
 */
class WordsCheck
{
    /** Maximum number of words joined into one regular expression. */
    const PREG_CHUNK_SIZE = 500;

    /** @var WordsNode|null Root of the character tree. */
    protected $tree = null;

    /** @var bool Not read or written by any method of this class. */
    protected $callIsNumeric = true;

    /** @var string[] Normalised, preg_quote()d banned words; unique and 0-indexed. */
    protected $words_array = [];

    /**
     * Load the banned-word list (one word per record) and build both lookup
     * structures from it.
     */
    public function __construct()
    {
        $this->tree = new WordsNode();

        // Words::words_array() supplies the word list read from the database.
        foreach (Words::words_array() as $word) {
            $word = $this->normalize($word);
            if ($word === '') {
                continue;
            }
            $this->words_array[] = preg_quote($word, '/');
            $this->setTree($word);
        }

        // array_unique() keeps the original keys; re-index so the list has no
        // gaps and every word ends up in exactly one regex chunk.
        $this->words_array = array_values(array_unique($this->words_array));
    }

    /**
     * Insert one already-normalised word into the character tree and mark its
     * last node as the end of a banned word.
     *
     * @param string $word
     * @return void
     */
    protected function setTree($word)
    {
        $chars = $this->strToArr($word);
        if (empty($chars)) {
            // Never flag the root itself as a word end.
            return;
        }

        $node = $this->tree;
        foreach ($chars as $char) {
            $node = $node->getChildAlways($char);
        }
        $node->end = true;
    }

    /**
     * Regex pass: find every occurrence of a banned word in the text.
     *
     * The word list is matched in chunks of PREG_CHUNK_SIZE alternatives so a
     * single pattern does not grow without bound.
     *
     * @param string $string Text to scan.
     * @return string[] Every match in normalised (lower-case, whitespace-free)
     *                  form; a word occurring several times is listed several
     *                  times. Empty when nothing matches or no words are loaded.
     */
    public function preg_check($string)
    {
        $string = $this->normalize($string);
        $found = [];

        // array_chunk() yields nothing for an empty list, so no empty pattern
        // (which would match at every offset) is ever built.
        foreach (array_chunk($this->words_array, self::PREG_CHUNK_SIZE) as $chunk) {
            $pattern = "/" . implode("|", $chunk) . "/i";
            // preg_match_all() returns 0 for no match and false on failure;
            // both are skipped here.
            if (preg_match_all($pattern, $string, $matches)) {
                $found = array_merge($found, $matches[0]);
            }
        }

        return $found;
    }

    /**
     * Scan the text: regex pass first, tree pass when the regex pass returns
     * nothing or throws an ErrorException (e.g. when an error handler turns
     * PCRE warnings into exceptions).
     *
     * @param string $str Text to scan.
     * @return string[] Banned words found in the text.
     */
    public function check($str)
    {
        try {
            $hits = $this->preg_check($str);
        } catch (ErrorException $e) {
            $hits = [];
        }

        return empty($hits) ? $this->tree_check($str) : $hits;
    }

    /**
     * Tree pass: return the distinct banned words contained in the text.
     *
     * Every character position is tried as a possible word start. The walk
     * stops at the first word end it reaches (shortest match) and scanning
     * resumes right after that word; on a failed walk only the start position
     * advances by one character, so a word beginning inside a failed prefix is
     * still found.
     *
     * @param string $str Text to scan.
     * @return string[] Distinct matches in order of first occurrence, 0-indexed.
     */
    public function tree_check($str)
    {
        $chars = $this->strToArr($this->normalize($str));
        $total = count($chars);
        $found = [];

        $start = 0;
        while ($start < $total) {
            $node = $this->tree;
            $candidate = '';
            $endAt = -1;

            for ($i = $start; $i < $total; $i++) {
                $node = $node->getChild($chars[$i]);
                if ($node === null) {
                    break;
                }
                $candidate .= $chars[$i];
                if ($node->end) {
                    $endAt = $i;
                    break;
                }
            }

            if ($endAt >= 0) {
                $found[] = $candidate;
                $start = $endAt + 1;
            } else {
                $start++;
            }
        }

        return array_values(array_unique($found));
    }

    /**
     * Split a UTF-8 string into an array of single characters.
     *
     * @param string $str
     * @return string[] One element per character; empty for an empty string.
     */
    protected function strToArr($str)
    {
        $str = (string) $str;

        $chars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false) {
            // The /u modifier rejects malformed UTF-8; re-encode to replace the
            // malformed bytes, then split again.
            $chars = preg_split(
                '//u',
                mb_convert_encoding($str, 'UTF-8', 'UTF-8'),
                -1,
                PREG_SPLIT_NO_EMPTY
            );
        }

        return $chars === false ? [] : $chars;
    }

    /**
     * Bring a word or a text into the canonical form both passes compare on:
     * lower-cased, with spacing characters removed, then trimmed.
     *
     * Removed characters: space, tab, LF, CR, no-break space (U+00A0) and
     * ideographic space (U+3000), so spacing a word out does not hide it.
     *
     * @param string $text
     * @return string
     */
    protected function normalize($text)
    {
        $lower = mb_strtolower((string) $text, 'UTF-8');
        $compact = str_replace(
            [' ', "\t", "\n", "\r", "\xC2\xA0", "\xE3\x80\x80"],
            '',
            $lower
        );

        return trim($compact);
    }
}
要过滤的时候:
// Build the checker (its constructor loads the word list), then scan the text
// held in $comment, which the surrounding code is expected to define.
$wordsCheck = new WordsCheck();
$badwords = $wordsCheck->check($comment);
// $badwords is now the array of sensitive words found in $comment.
好了本文内容全部结束,感谢您的阅读,希望对您有所帮助。