| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396 | <?php/** * * Author: abel * Date: 17/4/10 * Time: 17:26 */namespace AbelZhou\Tree;/** * 这是一个支持中文的字典树 * Class TrieTree * @package AbelZhou\Tree */class TrieTree {    protected $nodeTree = [];    /**     * 构造     * TrieTree constructor.     */    public function __construct() {    }    /**     * 从树种摘除一个文本     * @param $index_str     */    public function delete($index_str, $deltree = false, $is_py = false, $chinese = "") {        $str = trim($index_str);        $chinese = trim($chinese);        if ($is_py && empty($chinese)) {            return false;        }        $delstr_arr = $this->convertStrToH($str);        $len = count($delstr_arr);        //提取树        $childTree = &$this->nodeTree;        $del_index = array();        //提取树中的相关索引        for ($i = 0; $i < $len; $i++) {            $code = $delstr_arr[$i];            //命中将其纳入索引范围            if (isset($childTree[$code])) {                //del tree                $del_index[$i] = [                    'code' => $code,                    'index' => &$childTree[$code]                ];                //若检索到最后一个字,检查是否是一个关键词的末梢                if ($i == ($len - 1) && !$childTree[$code]['end']) {                    return false;                }                $childTree = &$childTree[$code]['child'];            } else {                //发现没有命中 删除失败                return false;            }        }        $idx = $len - 1;        //删除整棵树        if ($deltree) {            //清空子集            $del_index[$idx]['index']['child'] = array();        }        //只有一个字 直接删除        if ($idx == 0) {            //如果是拼音 只删除相应的拼音索引            if ($is_py) {                //清除单个拼音索引                if (isset($this->nodeTree[$del_index[$idx]['code']]['chinese_list'])) {                    $is_del = false;                    foreach ($this->nodeTree[$del_index[$idx]['code']]['chinese_list'] as $key=>$node) {                        if ($node['word'] == $chinese){                            unset($this->nodeTree[$del_index[$idx]['code']]['chinese_list'][$key]);                            $is_del = true;                            break;                        }                    }                    if($is_del && 0 != count($this->nodeTree[$del_index[$idx]['code']]['chinese_list'])){                         return true;                    }                    if(!$is_del){                        return false;                    }                    //如果依然存在中文数据 则继续向下跑删除节点                }            }else{                if (count($del_index[$idx]['index']['child']) == 0) {                    unset($this->nodeTree[$del_index[$idx]['code']]);                    return true;                }            }        }        //末梢为关键词结尾,且存在子集 清除结尾标签        if (count($del_index[$idx]['index']['child']) > 0) {            $del_index[$idx]['index']['end'] = false;            $del_index[$idx]['index']['data'] = array();            unset($del_index[$idx]['index']['full']);            return true;        }//        var_dump($this->nodeTree['e59bbd']);exit();        //以下为末梢不存在子集的情况        //倒序检索 子集大于2的 清除child        for (; $idx >= 0; $idx--) {            //检测子集 若发现联字情况 检测是否为其他关键词结尾            if (count($del_index[$idx]['index']['child']) > 0) {                //遇到结束标记或者count>1的未结束节点直接清空子集跳出                if ($del_index[$idx]['index']['end'] == true || $del_index[$idx]['index']['child'] > 1) {                    //清空子集                    $child_code = $del_index[$idx + 1]['code'];                    unset($del_index[$idx]['index']['child'][$child_code]);                    return true;                }            }        }        return false;    }    /**     * ADD word [UTF8]     * 增加新特性,在质感末梢增加自定义数组     * @param $index_str 添加的词     * @param array $data 添加词的附加属性     * @return $this     */    public function append($index_str, $data = array(), $is_py = false, $chinese = '') {        $str = trim($index_str);        $chinese = trim($chinese);        if ($is_py && empty($chinese)) {            return false;        }        $childTree = &$this->nodeTree;        $len = strlen($str);        for ($i = 0; $i < $len; $i++) {            $ascii_code = ord($str[$i]);            $code = NULL;            $word = NULL;            $is_end = false;            if (($ascii_code >> 7) == 0) {                $code = dechex(ord($str[$i]));                $word = $str[$i];            } else if (($ascii_code >> 4) == 15) {    //1111 xxxx, 四字节                if ($i < $len - 3) {                    $code = dechex(ord($str[$i])) . dechex(ord($str[$i + 1])) . dechex(ord($str[$i + 2])) . dechex(ord($str[$i + 3]));                    $word = $str[$i] . $str[$i + 1] . $str[$i + 2] . $str[$i + 3];                    $i += 3;                }            } else if (($ascii_code >> 5) == 7) {    //111x xxxx, 三字节                if ($i < $len - 2) {                    $code = dechex(ord($str[$i])) . dechex(ord($str[$i + 1])) . dechex(ord($str[$i + 2]));                    $word = $str[$i] . $str[$i + 1] . $str[$i + 2];                    $i += 2;                }            } else if (($ascii_code >> 6) == 3) {    //11xx xxxx, 2字节                if ($i < $len - 1) {                    $code = dechex(ord($str[$i])) . dechex(ord($str[$i + 1]));                    $word = $str[$i] . $str[$i + 1];                    $i++;                }            }            if ($i == ($len - 1)) {                $is_end = true;                if ($is_py) {                    $str = $chinese;                }            }            $childTree = &$this->_appendWordToTree($childTree, $code, $word, $is_end, $data, $str, $is_py);        }        unset($childTree);        return $this;    }    /**     * 追加一个字[中英文]到树中     * @param $tree     * @param $code     * @param $word     * @param bool $end     * @param array $data     * @param string $full_str     * @return mixed     */    private function &_appendWordToTree(&$tree, $code, $word, $end = false, $data = array(), $full_str = '', $is_py) {        if (!isset($tree[$code])) {            $tree[$code] = array(                'end' => $end,                'child' => array(),                'value' => $word,            );        }        if ($end) {            $tree[$code]['end'] = true;            $tree[$code]['is_py'] = $is_py;            //拼音不需要full 拼音根据读音多样性对应多个词语 重复词语覆盖data            if ($is_py) {                $is_change = false;                if(isset($tree[$code]["chinese_list"]) && count($tree[$code]["chinese_list"])>0) {                    foreach ($tree[$code]["chinese_list"] as $key => &$node) {                        if ($node['word'] == $full_str) {                            $node['data'] = $data;                            $is_change = true;                            break;                        }                    }                }                if(!$is_change){                    $tree[$code]['chinese_list'][] = ["word" => $full_str, "data" => $data];                }            } else {                $tree[$code]['full'] = $full_str;                $tree[$code]['data'] = $data;            }        }        return $tree[$code]['child'];    }    /**     * 获得整棵树     * @return array     */    public function getTree() {        return $this->nodeTree;    }    /**     * 匹配下面的全部词语     * @param $word     * @param int $deep 检索深度 检索之后的词语数量可能会大于这个数字     * @return array|bool     */    public function getTreeWord($word, $deep = 0) {        $search = trim($word);        if (empty($search)) {            return false;        }        if ($deep === 0) {            $deep = 999;        }        $word_keys = $this->convertStrToH($search);        $tree = &$this->nodeTree;        $key_count = count($word_keys);        $words = [];        foreach ($word_keys as $key => $val) {            if (isset($tree[$val])) {                //检测当前词语是否已命中                if ($key == $key_count - 1 && $tree[$val]['end'] == true) {                    if (isset($tree[$val]['chinese_list'])) {                        $words = array_merge($words, $tree[$val]['chinese_list']);                    } else {                        $words[] = ["word" => $tree[$val]['full'], "data" => $tree[$val]['data']];                    }                }                $tree = &$tree[$val]["child"];            } else {                //遇到没有命中的返回//                if ($key == 0) {                    return [];//                }            }        }        $this->_getTreeWord($tree, $deep, $words);        return $words;    }    private function _getTreeWord(&$child, $deep, &$words = array()) {        foreach ($child as $node) {            if ($node['end'] == true) {                if (isset($node['chinese_list'])) {                    $words = array_merge($words, $node['chinese_list']);                } else {                    $words[] = ["word" => $node['full'], "data" => $node['data']];                }            }            if (!empty($node['child']) && $deep >= count($words)) {                $this->_getTreeWord($node['child'], $deep, $words);            }        }    }    /**     * overwrite tostring.     * @return string     */    public function __toString() {        // TODO: Implement __toString() method.        return json_encode($this->nodeTree);    }    /**     * 检索     * @param $search     * @return array|bool     */    public function search($search) {        $search = trim($search);        if (empty($search)) {            return false;        }        $search_keys = $this->convertStrToH($search);        //命中集合        $hit_arr = array();        $tree = &$this->nodeTree;        $arr_len = count($search_keys);        $current_index = 0;        for ($i = 0; $i < $arr_len; $i++) {            //若命中了一个索引 则继续向下寻找            if (isset($tree[$search_keys[$i]])) {                $node = $tree[$search_keys[$i]];                if ($node['end']) {                    //发现结尾 将原词以及自定义属性纳入返回结果集中 3.1 增加词频统计                    $key = md5($node['full']);                    if (isset($hit_arr[$key])) {                        $hit_arr[$key]['count'] += 1;                    } else {                        $hit_arr[$key] = array(                            'word' => $node['full'],                            'data' => $node['data'],                            'count' => 1                        );                    }                    if (empty($node['child'])) {                        //若不存在子集,检索游标还原                        $i = $current_index;                        //还原检索集合                        $tree = &$this->nodeTree;                        //字码游标下移                        $current_index++;                    } else {                        //存在子集重定义检索tree                        $tree = &$tree[$search_keys[$i]]['child'];                    }                    continue;                } else {                    //没发现结尾继续向下检索                    $tree = &$tree[$search_keys[$i]]['child'];                    continue;                }            } else {                //还原检索起点                $i = $current_index;                //还原tree                $tree = &$this->nodeTree;                //字码位移                $current_index++;                continue;            }        }        unset($tree, $search_keys);        return $hit_arr;    }    /**     * 将字符转为16进制标示     * @param $str     * @return array     */    public function convertStrToH($str) {        $len = strlen($str);        $chars = [];        for ($i = 0; $i < $len; $i++) {            $ascii_code = ord($str[$i]);            if (($ascii_code >> 7) == 0) {                $chars[] = dechex(ord($str[$i]));            } else if (($ascii_code >> 4) == 15) {    //1111 xxxx, 四字节                if ($i < $len - 3) {                    $chars[] = dechex(ord($str[$i])) . dechex(ord($str[$i + 1])) . dechex(ord($str[$i + 2])) . dechex(ord($str[$i + 3]));                    $i += 3;                }            } else if (($ascii_code >> 5) == 7) {    //111x xxxx, 三字节                if ($i < $len - 2) {                    $chars[] = dechex(ord($str[$i])) . dechex(ord($str[$i + 1])) . dechex(ord($str[$i + 2]));                    $i += 2;                }            } else if (($ascii_code >> 6) == 3) {    //11xx xxxx, 2字节                if ($i < $len - 1) {                    $chars[] = dechex(ord($str[$i])) . dechex(ord($str[$i + 1]));                    $i++;                }            }        }        return $chars;    }}
 |