<?php /** * * Author: abel * Date: 17/4/10 * Time: 17:26 */ namespace AbelZhou\Tree; /** * 这是一个支持中文的字典树 * Class TrieTree * @package AbelZhou\Tree */ class TrieTree { protected $nodeTree = []; /** * 构造 * TrieTree constructor. */ public function __construct() { } /** * 从树种摘除一个文本 * @param $index_str */ public function delete($index_str, $deltree = false, $is_py = false, $chinese = "") { $str = trim($index_str); $chinese = trim($chinese); if ($is_py && empty($chinese)) { return false; } $delstr_arr = $this->convertStrToH($str); $len = count($delstr_arr); //提取树 $childTree = &$this->nodeTree; $del_index = array(); //提取树中的相关索引 for ($i = 0; $i < $len; $i++) { $code = $delstr_arr[$i]; //命中将其纳入索引范围 if (isset($childTree[$code])) { //del tree $del_index[$i] = [ 'code' => $code, 'index' => &$childTree[$code] ]; //若检索到最后一个字,检查是否是一个关键词的末梢 if ($i == ($len - 1) && !$childTree[$code]['end']) { return false; } $childTree = &$childTree[$code]['child']; } else { //发现没有命中 删除失败 return false; } } $idx = $len - 1; //删除整棵树 if ($deltree) { //清空子集 $del_index[$idx]['index']['child'] = array(); } //只有一个字 直接删除 if ($idx == 0) { //如果是拼音 只删除相应的拼音索引 if ($is_py) { //清除单个拼音索引 if (isset($this->nodeTree[$del_index[$idx]['code']]['chinese_list'])) { $is_del = false; foreach ($this->nodeTree[$del_index[$idx]['code']]['chinese_list'] as $key=>$node) { if ($node['word'] == $chinese){ unset($this->nodeTree[$del_index[$idx]['code']]['chinese_list'][$key]); $is_del = true; break; } } if($is_del && 0 != count($this->nodeTree[$del_index[$idx]['code']]['chinese_list'])){ return true; } if(!$is_del){ return false; } //如果依然存在中文数据 则继续向下跑删除节点 } }else{ if (count($del_index[$idx]['index']['child']) == 0) { unset($this->nodeTree[$del_index[$idx]['code']]); return true; } } } //末梢为关键词结尾,且存在子集 清除结尾标签 if (count($del_index[$idx]['index']['child']) > 0) { $del_index[$idx]['index']['end'] = false; $del_index[$idx]['index']['data'] = array(); unset($del_index[$idx]['index']['full']); return true; } // var_dump($this->nodeTree['e59bbd']);exit(); //以下为末梢不存在子集的情况 //倒序检索 子集大于2的 清除child for (; $idx >= 0; $idx--) { //检测子集 若发现联字情况 检测是否为其他关键词结尾 if (count($del_index[$idx]['index']['child']) > 0) { //遇到结束标记或者count>1的未结束节点直接清空子集跳出 if ($del_index[$idx]['index']['end'] == true || $del_index[$idx]['index']['child'] > 1) { //清空子集 $child_code = $del_index[$idx + 1]['code']; unset($del_index[$idx]['index']['child'][$child_code]); return true; } } } return false; } /** * ADD word [UTF8] * 增加新特性,在质感末梢增加自定义数组 * @param $index_str 添加的词 * @param array $data 添加词的附加属性 * @return $this */ public function append($index_str, $data = array(), $is_py = false, $chinese = '') { $str = trim($index_str); $chinese = trim($chinese); if ($is_py && empty($chinese)) { return false; } $childTree = &$this->nodeTree; $len = strlen($str); for ($i = 0; $i < $len; $i++) { $ascii_code = ord($str[$i]); $code = NULL; $word = NULL; $is_end = false; if (($ascii_code >> 7) == 0) { $code = dechex(ord($str[$i])); $word = $str[$i]; } else if (($ascii_code >> 4) == 15) { //1111 xxxx, 四字节 if ($i < $len - 3) { $code = dechex(ord($str[$i])) . dechex(ord($str[$i + 1])) . dechex(ord($str[$i + 2])) . dechex(ord($str[$i + 3])); $word = $str[$i] . $str[$i + 1] . $str[$i + 2] . $str[$i + 3]; $i += 3; } } else if (($ascii_code >> 5) == 7) { //111x xxxx, 三字节 if ($i < $len - 2) { $code = dechex(ord($str[$i])) . dechex(ord($str[$i + 1])) . dechex(ord($str[$i + 2])); $word = $str[$i] . $str[$i + 1] . $str[$i + 2]; $i += 2; } } else if (($ascii_code >> 6) == 3) { //11xx xxxx, 2字节 if ($i < $len - 1) { $code = dechex(ord($str[$i])) . dechex(ord($str[$i + 1])); $word = $str[$i] . $str[$i + 1]; $i++; } } if ($i == ($len - 1)) { $is_end = true; if ($is_py) { $str = $chinese; } } $childTree = &$this->_appendWordToTree($childTree, $code, $word, $is_end, $data, $str, $is_py); } unset($childTree); return $this; } /** * 追加一个字[中英文]到树中 * @param $tree * @param $code * @param $word * @param bool $end * @param array $data * @param string $full_str * @return mixed */ private function &_appendWordToTree(&$tree, $code, $word, $end = false, $data = array(), $full_str = '', $is_py) { if (!isset($tree[$code])) { $tree[$code] = array( 'end' => $end, 'child' => array(), 'value' => $word, ); } if ($end) { $tree[$code]['end'] = true; $tree[$code]['is_py'] = $is_py; //拼音不需要full 拼音根据读音多样性对应多个词语 重复词语覆盖data if ($is_py) { $is_change = false; if(isset($tree[$code]["chinese_list"]) && count($tree[$code]["chinese_list"])>0) { foreach ($tree[$code]["chinese_list"] as $key => &$node) { if ($node['word'] == $full_str) { $node['data'] = $data; $is_change = true; break; } } } if(!$is_change){ $tree[$code]['chinese_list'][] = ["word" => $full_str, "data" => $data]; } } else { $tree[$code]['full'] = $full_str; $tree[$code]['data'] = $data; } } return $tree[$code]['child']; } /** * 获得整棵树 * @return array */ public function getTree() { return $this->nodeTree; } /** * 匹配下面的全部词语 * @param $word * @param int $deep 检索深度 检索之后的词语数量可能会大于这个数字 * @return array|bool */ public function getTreeWord($word, $deep = 0) { $search = trim($word); if (empty($search)) { return false; } if ($deep === 0) { $deep = 999; } $word_keys = $this->convertStrToH($search); $tree = &$this->nodeTree; $key_count = count($word_keys); $words = []; foreach ($word_keys as $key => $val) { if (isset($tree[$val])) { //检测当前词语是否已命中 if ($key == $key_count - 1 && $tree[$val]['end'] == true) { if (isset($tree[$val]['chinese_list'])) { $words = array_merge($words, $tree[$val]['chinese_list']); } else { $words[] = ["word" => $tree[$val]['full'], "data" => $tree[$val]['data']]; } } $tree = &$tree[$val]["child"]; } else { //遇到没有命中的返回 // if ($key == 0) { return []; // } } } $this->_getTreeWord($tree, $deep, $words); return $words; } private function _getTreeWord(&$child, $deep, &$words = array()) { foreach ($child as $node) { if ($node['end'] == true) { if (isset($node['chinese_list'])) { $words = array_merge($words, $node['chinese_list']); } else { $words[] = ["word" => $node['full'], "data" => $node['data']]; } } if (!empty($node['child']) && $deep >= count($words)) { $this->_getTreeWord($node['child'], $deep, $words); } } } /** * overwrite tostring. * @return string */ public function __toString() { // TODO: Implement __toString() method. return json_encode($this->nodeTree); } /** * 检索 * @param $search * @return array|bool */ public function search($search) { $search = trim($search); if (empty($search)) { return false; } $search_keys = $this->convertStrToH($search); //命中集合 $hit_arr = array(); $tree = &$this->nodeTree; $arr_len = count($search_keys); $current_index = 0; for ($i = 0; $i < $arr_len; $i++) { //若命中了一个索引 则继续向下寻找 if (isset($tree[$search_keys[$i]])) { $node = $tree[$search_keys[$i]]; if ($node['end']) { //发现结尾 将原词以及自定义属性纳入返回结果集中 3.1 增加词频统计 $key = md5($node['full']); if (isset($hit_arr[$key])) { $hit_arr[$key]['count'] += 1; } else { $hit_arr[$key] = array( 'word' => $node['full'], 'data' => $node['data'], 'count' => 1 ); } if (empty($node['child'])) { //若不存在子集,检索游标还原 $i = $current_index; //还原检索集合 $tree = &$this->nodeTree; //字码游标下移 $current_index++; } else { //存在子集重定义检索tree $tree = &$tree[$search_keys[$i]]['child']; } continue; } else { //没发现结尾继续向下检索 $tree = &$tree[$search_keys[$i]]['child']; continue; } } else { //还原检索起点 $i = $current_index; //还原tree $tree = &$this->nodeTree; //字码位移 $current_index++; continue; } } unset($tree, $search_keys); return $hit_arr; } /** * 将字符转为16进制标示 * @param $str * @return array */ public function convertStrToH($str) { $len = strlen($str); $chars = []; for ($i = 0; $i < $len; $i++) { $ascii_code = ord($str[$i]); if (($ascii_code >> 7) == 0) { $chars[] = dechex(ord($str[$i])); } else if (($ascii_code >> 4) == 15) { //1111 xxxx, 四字节 if ($i < $len - 3) { $chars[] = dechex(ord($str[$i])) . dechex(ord($str[$i + 1])) . dechex(ord($str[$i + 2])) . dechex(ord($str[$i + 3])); $i += 3; } } else if (($ascii_code >> 5) == 7) { //111x xxxx, 三字节 if ($i < $len - 2) { $chars[] = dechex(ord($str[$i])) . dechex(ord($str[$i + 1])) . dechex(ord($str[$i + 2])); $i += 2; } } else if (($ascii_code >> 6) == 3) { //11xx xxxx, 2字节 if ($i < $len - 1) { $chars[] = dechex(ord($str[$i])) . dechex(ord($str[$i + 1])); $i++; } } } return $chars; } }