123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396 |
- <?php
- /**
- *
- * Author: abel
- * Date: 17/4/10
- * Time: 17:26
- */
- namespace AbelZhou\Tree;
- /**
- * 这是一个支持中文的字典树
- * Class TrieTree
- * @package AbelZhou\Tree
- */
- class TrieTree {
- protected $nodeTree = [];
- /**
- * 构造
- * TrieTree constructor.
- */
- public function __construct() {
- }
- /**
- * 从树种摘除一个文本
- * @param $index_str
- */
- public function delete($index_str, $deltree = false, $is_py = false, $chinese = "") {
- $str = trim($index_str);
- $chinese = trim($chinese);
- if ($is_py && empty($chinese)) {
- return false;
- }
- $delstr_arr = $this->convertStrToH($str);
- $len = count($delstr_arr);
- //提取树
- $childTree = &$this->nodeTree;
- $del_index = array();
- //提取树中的相关索引
- for ($i = 0; $i < $len; $i++) {
- $code = $delstr_arr[$i];
- //命中将其纳入索引范围
- if (isset($childTree[$code])) {
- //del tree
- $del_index[$i] = [
- 'code' => $code,
- 'index' => &$childTree[$code]
- ];
- //若检索到最后一个字,检查是否是一个关键词的末梢
- if ($i == ($len - 1) && !$childTree[$code]['end']) {
- return false;
- }
- $childTree = &$childTree[$code]['child'];
- } else {
- //发现没有命中 删除失败
- return false;
- }
- }
- $idx = $len - 1;
- //删除整棵树
- if ($deltree) {
- //清空子集
- $del_index[$idx]['index']['child'] = array();
- }
- //只有一个字 直接删除
- if ($idx == 0) {
- //如果是拼音 只删除相应的拼音索引
- if ($is_py) {
- //清除单个拼音索引
- if (isset($this->nodeTree[$del_index[$idx]['code']]['chinese_list'])) {
- $is_del = false;
- foreach ($this->nodeTree[$del_index[$idx]['code']]['chinese_list'] as $key=>$node) {
- if ($node['word'] == $chinese){
- unset($this->nodeTree[$del_index[$idx]['code']]['chinese_list'][$key]);
- $is_del = true;
- break;
- }
- }
- if($is_del && 0 != count($this->nodeTree[$del_index[$idx]['code']]['chinese_list'])){
- return true;
- }
- if(!$is_del){
- return false;
- }
- //如果依然存在中文数据 则继续向下跑删除节点
- }
- }else{
- if (count($del_index[$idx]['index']['child']) == 0) {
- unset($this->nodeTree[$del_index[$idx]['code']]);
- return true;
- }
- }
- }
- //末梢为关键词结尾,且存在子集 清除结尾标签
- if (count($del_index[$idx]['index']['child']) > 0) {
- $del_index[$idx]['index']['end'] = false;
- $del_index[$idx]['index']['data'] = array();
- unset($del_index[$idx]['index']['full']);
- return true;
- }
- // var_dump($this->nodeTree['e59bbd']);exit();
- //以下为末梢不存在子集的情况
- //倒序检索 子集大于2的 清除child
- for (; $idx >= 0; $idx--) {
- //检测子集 若发现联字情况 检测是否为其他关键词结尾
- if (count($del_index[$idx]['index']['child']) > 0) {
- //遇到结束标记或者count>1的未结束节点直接清空子集跳出
- if ($del_index[$idx]['index']['end'] == true || $del_index[$idx]['index']['child'] > 1) {
- //清空子集
- $child_code = $del_index[$idx + 1]['code'];
- unset($del_index[$idx]['index']['child'][$child_code]);
- return true;
- }
- }
- }
- return false;
- }
- /**
- * ADD word [UTF8]
- * 增加新特性,在质感末梢增加自定义数组
- * @param $index_str 添加的词
- * @param array $data 添加词的附加属性
- * @return $this
- */
- public function append($index_str, $data = array(), $is_py = false, $chinese = '') {
- $str = trim($index_str);
- $chinese = trim($chinese);
- if ($is_py && empty($chinese)) {
- return false;
- }
- $childTree = &$this->nodeTree;
- $len = strlen($str);
- for ($i = 0; $i < $len; $i++) {
- $ascii_code = ord($str[$i]);
- $code = NULL;
- $word = NULL;
- $is_end = false;
- if (($ascii_code >> 7) == 0) {
- $code = dechex(ord($str[$i]));
- $word = $str[$i];
- } else if (($ascii_code >> 4) == 15) { //1111 xxxx, 四字节
- if ($i < $len - 3) {
- $code = dechex(ord($str[$i])) . dechex(ord($str[$i + 1])) . dechex(ord($str[$i + 2])) . dechex(ord($str[$i + 3]));
- $word = $str[$i] . $str[$i + 1] . $str[$i + 2] . $str[$i + 3];
- $i += 3;
- }
- } else if (($ascii_code >> 5) == 7) { //111x xxxx, 三字节
- if ($i < $len - 2) {
- $code = dechex(ord($str[$i])) . dechex(ord($str[$i + 1])) . dechex(ord($str[$i + 2]));
- $word = $str[$i] . $str[$i + 1] . $str[$i + 2];
- $i += 2;
- }
- } else if (($ascii_code >> 6) == 3) { //11xx xxxx, 2字节
- if ($i < $len - 1) {
- $code = dechex(ord($str[$i])) . dechex(ord($str[$i + 1]));
- $word = $str[$i] . $str[$i + 1];
- $i++;
- }
- }
- if ($i == ($len - 1)) {
- $is_end = true;
- if ($is_py) {
- $str = $chinese;
- }
- }
- $childTree = &$this->_appendWordToTree($childTree, $code, $word, $is_end, $data, $str, $is_py);
- }
- unset($childTree);
- return $this;
- }
- /**
- * 追加一个字[中英文]到树中
- * @param $tree
- * @param $code
- * @param $word
- * @param bool $end
- * @param array $data
- * @param string $full_str
- * @return mixed
- */
- private function &_appendWordToTree(&$tree, $code, $word, $end = false, $data = array(), $full_str = '', $is_py) {
- if (!isset($tree[$code])) {
- $tree[$code] = array(
- 'end' => $end,
- 'child' => array(),
- 'value' => $word,
- );
- }
- if ($end) {
- $tree[$code]['end'] = true;
- $tree[$code]['is_py'] = $is_py;
- //拼音不需要full 拼音根据读音多样性对应多个词语 重复词语覆盖data
- if ($is_py) {
- $is_change = false;
- if(isset($tree[$code]["chinese_list"]) && count($tree[$code]["chinese_list"])>0) {
- foreach ($tree[$code]["chinese_list"] as $key => &$node) {
- if ($node['word'] == $full_str) {
- $node['data'] = $data;
- $is_change = true;
- break;
- }
- }
- }
- if(!$is_change){
- $tree[$code]['chinese_list'][] = ["word" => $full_str, "data" => $data];
- }
- } else {
- $tree[$code]['full'] = $full_str;
- $tree[$code]['data'] = $data;
- }
- }
- return $tree[$code]['child'];
- }
- /**
- * 获得整棵树
- * @return array
- */
- public function getTree() {
- return $this->nodeTree;
- }
- /**
- * 匹配下面的全部词语
- * @param $word
- * @param int $deep 检索深度 检索之后的词语数量可能会大于这个数字
- * @return array|bool
- */
- public function getTreeWord($word, $deep = 0) {
- $search = trim($word);
- if (empty($search)) {
- return false;
- }
- if ($deep === 0) {
- $deep = 999;
- }
- $word_keys = $this->convertStrToH($search);
- $tree = &$this->nodeTree;
- $key_count = count($word_keys);
- $words = [];
- foreach ($word_keys as $key => $val) {
- if (isset($tree[$val])) {
- //检测当前词语是否已命中
- if ($key == $key_count - 1 && $tree[$val]['end'] == true) {
- if (isset($tree[$val]['chinese_list'])) {
- $words = array_merge($words, $tree[$val]['chinese_list']);
- } else {
- $words[] = ["word" => $tree[$val]['full'], "data" => $tree[$val]['data']];
- }
- }
- $tree = &$tree[$val]["child"];
- } else {
- //遇到没有命中的返回
- // if ($key == 0) {
- return [];
- // }
- }
- }
- $this->_getTreeWord($tree, $deep, $words);
- return $words;
- }
- private function _getTreeWord(&$child, $deep, &$words = array()) {
- foreach ($child as $node) {
- if ($node['end'] == true) {
- if (isset($node['chinese_list'])) {
- $words = array_merge($words, $node['chinese_list']);
- } else {
- $words[] = ["word" => $node['full'], "data" => $node['data']];
- }
- }
- if (!empty($node['child']) && $deep >= count($words)) {
- $this->_getTreeWord($node['child'], $deep, $words);
- }
- }
- }
- /**
- * overwrite tostring.
- * @return string
- */
- public function __toString() {
- // TODO: Implement __toString() method.
- return json_encode($this->nodeTree);
- }
- /**
- * 检索
- * @param $search
- * @return array|bool
- */
- public function search($search) {
- $search = trim($search);
- if (empty($search)) {
- return false;
- }
- $search_keys = $this->convertStrToH($search);
- //命中集合
- $hit_arr = array();
- $tree = &$this->nodeTree;
- $arr_len = count($search_keys);
- $current_index = 0;
- for ($i = 0; $i < $arr_len; $i++) {
- //若命中了一个索引 则继续向下寻找
- if (isset($tree[$search_keys[$i]])) {
- $node = $tree[$search_keys[$i]];
- if ($node['end']) {
- //发现结尾 将原词以及自定义属性纳入返回结果集中 3.1 增加词频统计
- $key = md5($node['full']);
- if (isset($hit_arr[$key])) {
- $hit_arr[$key]['count'] += 1;
- } else {
- $hit_arr[$key] = array(
- 'word' => $node['full'],
- 'data' => $node['data'],
- 'count' => 1
- );
- }
- if (empty($node['child'])) {
- //若不存在子集,检索游标还原
- $i = $current_index;
- //还原检索集合
- $tree = &$this->nodeTree;
- //字码游标下移
- $current_index++;
- } else {
- //存在子集重定义检索tree
- $tree = &$tree[$search_keys[$i]]['child'];
- }
- continue;
- } else {
- //没发现结尾继续向下检索
- $tree = &$tree[$search_keys[$i]]['child'];
- continue;
- }
- } else {
- //还原检索起点
- $i = $current_index;
- //还原tree
- $tree = &$this->nodeTree;
- //字码位移
- $current_index++;
- continue;
- }
- }
- unset($tree, $search_keys);
- return $hit_arr;
- }
- /**
- * 将字符转为16进制标示
- * @param $str
- * @return array
- */
- public function convertStrToH($str) {
- $len = strlen($str);
- $chars = [];
- for ($i = 0; $i < $len; $i++) {
- $ascii_code = ord($str[$i]);
- if (($ascii_code >> 7) == 0) {
- $chars[] = dechex(ord($str[$i]));
- } else if (($ascii_code >> 4) == 15) { //1111 xxxx, 四字节
- if ($i < $len - 3) {
- $chars[] = dechex(ord($str[$i])) . dechex(ord($str[$i + 1])) . dechex(ord($str[$i + 2])) . dechex(ord($str[$i + 3]));
- $i += 3;
- }
- } else if (($ascii_code >> 5) == 7) { //111x xxxx, 三字节
- if ($i < $len - 2) {
- $chars[] = dechex(ord($str[$i])) . dechex(ord($str[$i + 1])) . dechex(ord($str[$i + 2]));
- $i += 2;
- }
- } else if (($ascii_code >> 6) == 3) { //11xx xxxx, 2字节
- if ($i < $len - 1) {
- $chars[] = dechex(ord($str[$i])) . dechex(ord($str[$i + 1]));
- $i++;
- }
- }
- }
- return $chars;
- }
- }
|