TrieTree.php 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396
  1. <?php
  2. /**
  3. *
  4. * Author: abel
  5. * Date: 17/4/10
  6. * Time: 17:26
  7. */
  8. namespace AbelZhou\Tree;
  9. /**
  10. * 这是一个支持中文的字典树
  11. * Class TrieTree
  12. * @package AbelZhou\Tree
  13. */
  14. class TrieTree {
  15. protected $nodeTree = [];
  16. /**
  17. * 构造
  18. * TrieTree constructor.
  19. */
  20. public function __construct() {
  21. }
  22. /**
  23. * 从树种摘除一个文本
  24. * @param $index_str
  25. */
  26. public function delete($index_str, $deltree = false, $is_py = false, $chinese = "") {
  27. $str = trim($index_str);
  28. $chinese = trim($chinese);
  29. if ($is_py && empty($chinese)) {
  30. return false;
  31. }
  32. $delstr_arr = $this->convertStrToH($str);
  33. $len = count($delstr_arr);
  34. //提取树
  35. $childTree = &$this->nodeTree;
  36. $del_index = array();
  37. //提取树中的相关索引
  38. for ($i = 0; $i < $len; $i++) {
  39. $code = $delstr_arr[$i];
  40. //命中将其纳入索引范围
  41. if (isset($childTree[$code])) {
  42. //del tree
  43. $del_index[$i] = [
  44. 'code' => $code,
  45. 'index' => &$childTree[$code]
  46. ];
  47. //若检索到最后一个字,检查是否是一个关键词的末梢
  48. if ($i == ($len - 1) && !$childTree[$code]['end']) {
  49. return false;
  50. }
  51. $childTree = &$childTree[$code]['child'];
  52. } else {
  53. //发现没有命中 删除失败
  54. return false;
  55. }
  56. }
  57. $idx = $len - 1;
  58. //删除整棵树
  59. if ($deltree) {
  60. //清空子集
  61. $del_index[$idx]['index']['child'] = array();
  62. }
  63. //只有一个字 直接删除
  64. if ($idx == 0) {
  65. //如果是拼音 只删除相应的拼音索引
  66. if ($is_py) {
  67. //清除单个拼音索引
  68. if (isset($this->nodeTree[$del_index[$idx]['code']]['chinese_list'])) {
  69. $is_del = false;
  70. foreach ($this->nodeTree[$del_index[$idx]['code']]['chinese_list'] as $key=>$node) {
  71. if ($node['word'] == $chinese){
  72. unset($this->nodeTree[$del_index[$idx]['code']]['chinese_list'][$key]);
  73. $is_del = true;
  74. break;
  75. }
  76. }
  77. if($is_del && 0 != count($this->nodeTree[$del_index[$idx]['code']]['chinese_list'])){
  78. return true;
  79. }
  80. if(!$is_del){
  81. return false;
  82. }
  83. //如果依然存在中文数据 则继续向下跑删除节点
  84. }
  85. }else{
  86. if (count($del_index[$idx]['index']['child']) == 0) {
  87. unset($this->nodeTree[$del_index[$idx]['code']]);
  88. return true;
  89. }
  90. }
  91. }
  92. //末梢为关键词结尾,且存在子集 清除结尾标签
  93. if (count($del_index[$idx]['index']['child']) > 0) {
  94. $del_index[$idx]['index']['end'] = false;
  95. $del_index[$idx]['index']['data'] = array();
  96. unset($del_index[$idx]['index']['full']);
  97. return true;
  98. }
  99. // var_dump($this->nodeTree['e59bbd']);exit();
  100. //以下为末梢不存在子集的情况
  101. //倒序检索 子集大于2的 清除child
  102. for (; $idx >= 0; $idx--) {
  103. //检测子集 若发现联字情况 检测是否为其他关键词结尾
  104. if (count($del_index[$idx]['index']['child']) > 0) {
  105. //遇到结束标记或者count>1的未结束节点直接清空子集跳出
  106. if ($del_index[$idx]['index']['end'] == true || $del_index[$idx]['index']['child'] > 1) {
  107. //清空子集
  108. $child_code = $del_index[$idx + 1]['code'];
  109. unset($del_index[$idx]['index']['child'][$child_code]);
  110. return true;
  111. }
  112. }
  113. }
  114. return false;
  115. }
  116. /**
  117. * ADD word [UTF8]
  118. * 增加新特性,在质感末梢增加自定义数组
  119. * @param $index_str 添加的词
  120. * @param array $data 添加词的附加属性
  121. * @return $this
  122. */
  123. public function append($index_str, $data = array(), $is_py = false, $chinese = '') {
  124. $str = trim($index_str);
  125. $chinese = trim($chinese);
  126. if ($is_py && empty($chinese)) {
  127. return false;
  128. }
  129. $childTree = &$this->nodeTree;
  130. $len = strlen($str);
  131. for ($i = 0; $i < $len; $i++) {
  132. $ascii_code = ord($str[$i]);
  133. $code = NULL;
  134. $word = NULL;
  135. $is_end = false;
  136. if (($ascii_code >> 7) == 0) {
  137. $code = dechex(ord($str[$i]));
  138. $word = $str[$i];
  139. } else if (($ascii_code >> 4) == 15) { //1111 xxxx, 四字节
  140. if ($i < $len - 3) {
  141. $code = dechex(ord($str[$i])) . dechex(ord($str[$i + 1])) . dechex(ord($str[$i + 2])) . dechex(ord($str[$i + 3]));
  142. $word = $str[$i] . $str[$i + 1] . $str[$i + 2] . $str[$i + 3];
  143. $i += 3;
  144. }
  145. } else if (($ascii_code >> 5) == 7) { //111x xxxx, 三字节
  146. if ($i < $len - 2) {
  147. $code = dechex(ord($str[$i])) . dechex(ord($str[$i + 1])) . dechex(ord($str[$i + 2]));
  148. $word = $str[$i] . $str[$i + 1] . $str[$i + 2];
  149. $i += 2;
  150. }
  151. } else if (($ascii_code >> 6) == 3) { //11xx xxxx, 2字节
  152. if ($i < $len - 1) {
  153. $code = dechex(ord($str[$i])) . dechex(ord($str[$i + 1]));
  154. $word = $str[$i] . $str[$i + 1];
  155. $i++;
  156. }
  157. }
  158. if ($i == ($len - 1)) {
  159. $is_end = true;
  160. if ($is_py) {
  161. $str = $chinese;
  162. }
  163. }
  164. $childTree = &$this->_appendWordToTree($childTree, $code, $word, $is_end, $data, $str, $is_py);
  165. }
  166. unset($childTree);
  167. return $this;
  168. }
  169. /**
  170. * 追加一个字[中英文]到树中
  171. * @param $tree
  172. * @param $code
  173. * @param $word
  174. * @param bool $end
  175. * @param array $data
  176. * @param string $full_str
  177. * @return mixed
  178. */
  179. private function &_appendWordToTree(&$tree, $code, $word, $end = false, $data = array(), $full_str = '', $is_py) {
  180. if (!isset($tree[$code])) {
  181. $tree[$code] = array(
  182. 'end' => $end,
  183. 'child' => array(),
  184. 'value' => $word,
  185. );
  186. }
  187. if ($end) {
  188. $tree[$code]['end'] = true;
  189. $tree[$code]['is_py'] = $is_py;
  190. //拼音不需要full 拼音根据读音多样性对应多个词语 重复词语覆盖data
  191. if ($is_py) {
  192. $is_change = false;
  193. if(isset($tree[$code]["chinese_list"]) && count($tree[$code]["chinese_list"])>0) {
  194. foreach ($tree[$code]["chinese_list"] as $key => &$node) {
  195. if ($node['word'] == $full_str) {
  196. $node['data'] = $data;
  197. $is_change = true;
  198. break;
  199. }
  200. }
  201. }
  202. if(!$is_change){
  203. $tree[$code]['chinese_list'][] = ["word" => $full_str, "data" => $data];
  204. }
  205. } else {
  206. $tree[$code]['full'] = $full_str;
  207. $tree[$code]['data'] = $data;
  208. }
  209. }
  210. return $tree[$code]['child'];
  211. }
  212. /**
  213. * 获得整棵树
  214. * @return array
  215. */
  216. public function getTree() {
  217. return $this->nodeTree;
  218. }
  219. /**
  220. * 匹配下面的全部词语
  221. * @param $word
  222. * @param int $deep 检索深度 检索之后的词语数量可能会大于这个数字
  223. * @return array|bool
  224. */
  225. public function getTreeWord($word, $deep = 0) {
  226. $search = trim($word);
  227. if (empty($search)) {
  228. return false;
  229. }
  230. if ($deep === 0) {
  231. $deep = 999;
  232. }
  233. $word_keys = $this->convertStrToH($search);
  234. $tree = &$this->nodeTree;
  235. $key_count = count($word_keys);
  236. $words = [];
  237. foreach ($word_keys as $key => $val) {
  238. if (isset($tree[$val])) {
  239. //检测当前词语是否已命中
  240. if ($key == $key_count - 1 && $tree[$val]['end'] == true) {
  241. if (isset($tree[$val]['chinese_list'])) {
  242. $words = array_merge($words, $tree[$val]['chinese_list']);
  243. } else {
  244. $words[] = ["word" => $tree[$val]['full'], "data" => $tree[$val]['data']];
  245. }
  246. }
  247. $tree = &$tree[$val]["child"];
  248. } else {
  249. //遇到没有命中的返回
  250. // if ($key == 0) {
  251. return [];
  252. // }
  253. }
  254. }
  255. $this->_getTreeWord($tree, $deep, $words);
  256. return $words;
  257. }
  258. private function _getTreeWord(&$child, $deep, &$words = array()) {
  259. foreach ($child as $node) {
  260. if ($node['end'] == true) {
  261. if (isset($node['chinese_list'])) {
  262. $words = array_merge($words, $node['chinese_list']);
  263. } else {
  264. $words[] = ["word" => $node['full'], "data" => $node['data']];
  265. }
  266. }
  267. if (!empty($node['child']) && $deep >= count($words)) {
  268. $this->_getTreeWord($node['child'], $deep, $words);
  269. }
  270. }
  271. }
  272. /**
  273. * overwrite tostring.
  274. * @return string
  275. */
  276. public function __toString() {
  277. // TODO: Implement __toString() method.
  278. return json_encode($this->nodeTree);
  279. }
  280. /**
  281. * 检索
  282. * @param $search
  283. * @return array|bool
  284. */
  285. public function search($search) {
  286. $search = trim($search);
  287. if (empty($search)) {
  288. return false;
  289. }
  290. $search_keys = $this->convertStrToH($search);
  291. //命中集合
  292. $hit_arr = array();
  293. $tree = &$this->nodeTree;
  294. $arr_len = count($search_keys);
  295. $current_index = 0;
  296. for ($i = 0; $i < $arr_len; $i++) {
  297. //若命中了一个索引 则继续向下寻找
  298. if (isset($tree[$search_keys[$i]])) {
  299. $node = $tree[$search_keys[$i]];
  300. if ($node['end']) {
  301. //发现结尾 将原词以及自定义属性纳入返回结果集中 3.1 增加词频统计
  302. $key = md5($node['full']);
  303. if (isset($hit_arr[$key])) {
  304. $hit_arr[$key]['count'] += 1;
  305. } else {
  306. $hit_arr[$key] = array(
  307. 'word' => $node['full'],
  308. 'data' => $node['data'],
  309. 'count' => 1
  310. );
  311. }
  312. if (empty($node['child'])) {
  313. //若不存在子集,检索游标还原
  314. $i = $current_index;
  315. //还原检索集合
  316. $tree = &$this->nodeTree;
  317. //字码游标下移
  318. $current_index++;
  319. } else {
  320. //存在子集重定义检索tree
  321. $tree = &$tree[$search_keys[$i]]['child'];
  322. }
  323. continue;
  324. } else {
  325. //没发现结尾继续向下检索
  326. $tree = &$tree[$search_keys[$i]]['child'];
  327. continue;
  328. }
  329. } else {
  330. //还原检索起点
  331. $i = $current_index;
  332. //还原tree
  333. $tree = &$this->nodeTree;
  334. //字码位移
  335. $current_index++;
  336. continue;
  337. }
  338. }
  339. unset($tree, $search_keys);
  340. return $hit_arr;
  341. }
  342. /**
  343. * 将字符转为16进制标示
  344. * @param $str
  345. * @return array
  346. */
  347. public function convertStrToH($str) {
  348. $len = strlen($str);
  349. $chars = [];
  350. for ($i = 0; $i < $len; $i++) {
  351. $ascii_code = ord($str[$i]);
  352. if (($ascii_code >> 7) == 0) {
  353. $chars[] = dechex(ord($str[$i]));
  354. } else if (($ascii_code >> 4) == 15) { //1111 xxxx, 四字节
  355. if ($i < $len - 3) {
  356. $chars[] = dechex(ord($str[$i])) . dechex(ord($str[$i + 1])) . dechex(ord($str[$i + 2])) . dechex(ord($str[$i + 3]));
  357. $i += 3;
  358. }
  359. } else if (($ascii_code >> 5) == 7) { //111x xxxx, 三字节
  360. if ($i < $len - 2) {
  361. $chars[] = dechex(ord($str[$i])) . dechex(ord($str[$i + 1])) . dechex(ord($str[$i + 2]));
  362. $i += 2;
  363. }
  364. } else if (($ascii_code >> 6) == 3) { //11xx xxxx, 2字节
  365. if ($i < $len - 1) {
  366. $chars[] = dechex(ord($str[$i])) . dechex(ord($str[$i + 1]));
  367. $i++;
  368. }
  369. }
  370. }
  371. return $chars;
  372. }
  373. }