source: trunk/forum/includes/search/fulltext_native.php@ 482

Last change on this file since 482 was 400, checked in by george, 16 years ago
  • Přidáno: Nové forum phpBB 3.
File size: 45.7 KB
Line 
1<?php
2/**
3*
4* @package search
5* @version $Id: fulltext_native.php 9173 2008-12-04 17:01:39Z naderman $
6* @copyright (c) 2005 phpBB Group
7* @license http://opensource.org/licenses/gpl-license.php GNU Public License
8*
9*/
10
11/**
12* @ignore
13*/
// Stop immediately unless the IN_PHPBB constant has been defined.
if (defined('IN_PHPBB') === false)
{
    exit;
}
18
19/**
20* @ignore
21*/
22include_once($phpbb_root_path . 'includes/search/search.' . $phpEx);
23
24/**
25* fulltext_native
26* phpBB's own db driven fulltext search, version 2
27* @package search
28*/
29class fulltext_native extends search_backend
30{
31 var $stats = array();
32 var $word_length = array();
33 var $search_query;
34 var $common_words = array();
35
36 var $must_contain_ids = array();
37 var $must_not_contain_ids = array();
38 var $must_exclude_one_ids = array();
39
/**
* Sets up the native fulltext backend.
*
* Stores the configured minimum and maximum word lengths in
* $this->word_length and loads the utf_normalizer class file when that
* class is not available yet.
*
* @param boolean|string &$error Passed by reference. This constructor always sets it to false.
*
* @access public
*/
function fulltext_native(&$error)
{
    global $phpbb_root_path, $phpEx, $config;

    // Word length limits, read from the board configuration
    $limits = array();
    $limits['min'] = $config['fulltext_native_min_chars'];
    $limits['max'] = $config['fulltext_native_max_chars'];
    $this->word_length = $limits;

    // Load the UTF tools
    if (class_exists('utf_normalizer') === false)
    {
        include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
    }

    $error = false;
}
64
/**
* Cleans the user's search query and stores it in $this->search_query.
*
* If $terms is 'any' the words are extracted from the search query and
* combined with | inside brackets. The result is afterwards treated like a
* standard search query.
*
* The query is then analysed to fill the internal arrays $must_contain_ids,
* $must_not_contain_ids and $must_exclude_one_ids, which are later used by
* keyword_search(). Words found in the wordlist table are stored as integer
* word ids, words containing * are stored as quoted, escaped LIKE patterns.
* Words flagged as common in the wordlist, and unknown words whose length is
* outside the configured range, are collected in $this->common_words.
*
* A required word that is unknown, not common and of valid length is
* reported through trigger_error().
*
* @param string $keywords contains the search query string as entered by the user
* @param string $terms is either 'all' (use search query as entered, default words to 'must be contained in post')
*   or 'any' (find all posts containing at least one of the given words)
* @return boolean false if no valid keywords were found and otherwise true
*
* @access public
*/
function split_keywords($keywords, $terms)
{
    global $db, $user;

    $keywords = trim($this->cleanup($keywords, '+-|()*'));

    // allow word|word|word without brackets
    if ((strpos($keywords, ' ') === false) && (strpos($keywords, '|') !== false) && (strpos($keywords, '(') === false))
    {
        $keywords = '(' . $keywords . ')';
    }

    // $open_bracket is false or the byte offset of the currently open "(",
    // $space is false or the last +/- prefix seen outside of brackets
    $open_bracket = $space = false;
    for ($i = 0, $n = strlen($keywords); $i < $n; $i++)
    {
        if ($open_bracket !== false)
        {
            // inside brackets every separator becomes |
            switch ($keywords[$i])
            {
                case ')':
                    // "()" holds no words, turn both brackets into separators
                    if ($open_bracket + 1 == $i)
                    {
                        $keywords[$i - 1] = '|';
                        $keywords[$i] = '|';
                    }
                    $open_bracket = false;
                break;

                case '(':
                    $keywords[$i] = '|';
                break;

                case '+':
                case '-':
                case ' ':
                    $keywords[$i] = '|';
                break;
            }
        }
        else
        {
            switch ($keywords[$i])
            {
                case ')':
                    // closing bracket without an opening one
                    $keywords[$i] = ' ';
                break;

                case '(':
                    $open_bracket = $i;
                    $space = false;
                break;

                case '|':
                    $keywords[$i] = ' ';
                break;

                case '-':
                case '+':
                    $space = $keywords[$i];
                break;

                case ' ':
                    // spaces directly after a prefix are replaced by that prefix,
                    // the duplicates are collapsed by the regular expressions below
                    if ($space !== false)
                    {
                        $keywords[$i] = $space;
                    }
                break;

                default:
                    $space = false;
            }
        }
    }

    // Close a bracket the user left open. The offset has to be compared
    // against false: a bracket opened at offset 0 is falsy but still open.
    if ($open_bracket !== false)
    {
        $keywords .= ')';
    }

    $match = array(
        '#  +#',
        '#\|\|+#',
        '#(\+|\-)(?:\+|\-)+#',
        '#\(\|#',
        '#\|\)#',
    );
    $replace = array(
        ' ',
        '|',
        '$1',
        '(',
        ')',
    );

    $keywords = preg_replace($match, $replace, $keywords);

    // $keywords input format: each word separated by a space, words in a bracket are not separated

    // the user wants to search for any word, convert the search query
    if ($terms == 'any')
    {
        $words = array();

        preg_match_all('#([^\\s+\\-|()]+)(?:$|[\\s+\\-|()])#u', $keywords, $words);
        if (sizeof($words[1]))
        {
            $keywords = '(' . implode('|', $words[1]) . ')';
        }
    }

    // set the search_query which is shown to the user
    $this->search_query = $keywords;

    // words without wildcards can be looked up in the wordlist directly
    $exact_words = array();
    preg_match_all('#([^\\s+\\-|*()]+)(?:$|[\\s+\\-|()])#u', $keywords, $exact_words);
    $exact_words = $exact_words[1];

    $common_ids = $words = array();

    if (sizeof($exact_words))
    {
        $sql = 'SELECT word_id, word_text, word_common
            FROM ' . SEARCH_WORDLIST_TABLE . '
            WHERE ' . $db->sql_in_set('word_text', $exact_words);
        $result = $db->sql_query($sql);

        // store an array of words and ids, remove common words
        while ($row = $db->sql_fetchrow($result))
        {
            if ($row['word_common'])
            {
                $this->common_words[] = $row['word_text'];
                $common_ids[$row['word_text']] = (int) $row['word_id'];
                continue;
            }

            $words[$row['word_text']] = (int) $row['word_id'];
        }
        $db->sql_freeresult($result);
    }
    unset($exact_words);

    // now analyse the search query, first split it using the spaces
    $query = explode(' ', $keywords);

    $this->must_contain_ids = array();
    $this->must_not_contain_ids = array();
    $this->must_exclude_one_ids = array();

    $mode = '';
    $ignore_no_id = true;

    foreach ($query as $word)
    {
        // Compare with '' rather than using empty(): empty('0') is true and
        // would silently drop the keyword "0".
        if ($word === '')
        {
            continue;
        }

        // words which should not be included
        if ($word[0] == '-')
        {
            // substr() may return false on older PHP versions, normalise to ''
            $word = (string) substr($word, 1);
            $mode = 'must_not_contain';
            $ignore_no_id = true;

            // a lone "-" leaves nothing to analyse
            if ($word === '')
            {
                continue;
            }

            // a group of which at least one may not be in the resulting posts
            if ($word[0] == '(')
            {
                $word = array_unique(explode('|', (string) substr($word, 1, -1)));
                $mode = 'must_exclude_one';
            }
        }
        // words which have to be included
        else
        {
            // no prefix is the same as a +prefix
            if ($word[0] == '+')
            {
                $word = (string) substr($word, 1);
            }
            $ignore_no_id = false;
            $mode = 'must_contain';

            // a lone "+" leaves nothing to analyse
            if ($word === '')
            {
                continue;
            }

            // a group of words of which at least one word should be in every resulting post
            if ($word[0] == '(')
            {
                $word = array_unique(explode('|', (string) substr($word, 1, -1)));
            }
        }

        // if this is an array of words then retrieve an id for each
        if (is_array($word))
        {
            $non_common_words = array();
            $id_words = array();
            foreach ($word as $word_part)
            {
                // "()" yields an empty part, there is nothing to look up for it
                if ($word_part === '')
                {
                    continue;
                }

                if (strpos($word_part, '*') !== false)
                {
                    $id_words[] = '\'' . $db->sql_escape(str_replace('*', '%', $word_part)) . '\'';
                    $non_common_words[] = $word_part;
                }
                else if (isset($words[$word_part]))
                {
                    $id_words[] = $words[$word_part];
                    $non_common_words[] = $word_part;
                }
                else
                {
                    $len = utf8_strlen($word_part);
                    if ($len < $this->word_length['min'] || $len > $this->word_length['max'])
                    {
                        $this->common_words[] = $word_part;
                    }
                }
            }

            if (sizeof($id_words))
            {
                sort($id_words);
                if (sizeof($id_words) > 1)
                {
                    $this->{$mode . '_ids'}[] = $id_words;
                }
                else
                {
                    // a group with a single usable word behaves like a plain word
                    $mode = ($mode == 'must_exclude_one') ? 'must_not_contain' : $mode;
                    $this->{$mode . '_ids'}[] = $id_words[0];
                }
            }
            // throw an error if we shall not ignore nonexistent words
            else if (!$ignore_no_id && sizeof($non_common_words))
            {
                trigger_error(sprintf($user->lang['WORDS_IN_NO_POST'], implode(', ', $non_common_words)));
            }
            unset($non_common_words);

            continue;
        }

        // else we only need one id
        $wildcard = (strpos($word, '*') !== false);

        if ($wildcard)
        {
            // the length limits apply to the word without its wildcards
            $len = utf8_strlen(str_replace('*', '', $word));
            if ($len >= $this->word_length['min'] && $len <= $this->word_length['max'])
            {
                $this->{$mode . '_ids'}[] = '\'' . $db->sql_escape(str_replace('*', '%', $word)) . '\'';
            }
            else
            {
                $this->common_words[] = $word;
            }
        }
        else if (isset($words[$word]))
        {
            $this->{$mode . '_ids'}[] = $words[$word];
        }
        // throw an error if we shall not ignore nonexistent words
        else if (!$ignore_no_id)
        {
            if (!isset($common_ids[$word]))
            {
                $len = utf8_strlen($word);
                if ($len >= $this->word_length['min'] && $len <= $this->word_length['max'])
                {
                    trigger_error(sprintf($user->lang['WORD_IN_NO_POST'], $word));
                }
                else
                {
                    $this->common_words[] = $word;
                }
            }
        }
        else
        {
            $len = utf8_strlen($word);
            if ($len < $this->word_length['min'] || $len > $this->word_length['max'])
            {
                $this->common_words[] = $word;
            }
        }
    }

    // we can't search for negatives only
    if (!sizeof($this->must_contain_ids))
    {
        return false;
    }

    sort($this->must_contain_ids);
    sort($this->must_not_contain_ids);
    sort($this->must_exclude_one_ids);

    if (!empty($this->search_query))
    {
        return true;
    }
    return false;
}
383
/**
* Performs a search on keywords depending on display specific params. You have to run split_keywords() first.
*
* The query joins one alias of the word match table (m0, m1, ...) per
* required word or word group and one alias of the wordlist table
* (w0, w1, ...) per wildcard word. Results are read from the search cache
* through obtain_ids() when available and written to it through save_ids().
*
* @param string $type contains either posts or topics depending on what should be searched for
* @param string &$fields contains either titleonly (topic titles should be searched), msgonly (only message bodies should be searched), firstpost (only subject and body of the first post should be searched) or all (all post bodies and subjects should be searched)
* @param string &$terms is either 'all' (use query as entered, words without prefix should default to "have to be in field") or 'any' (ignore search query parts and just return all posts that contain any of the specified words)
* @param array &$sort_by_sql contains SQL code for the ORDER BY part of a query
* @param string &$sort_key is the key of $sort_by_sql for the selected sorting
* @param string &$sort_dir is either a or d representing ASC and DESC
* @param string &$sort_days specifies the maximum amount of days a post may be old
* @param array &$ex_fid_ary specifies an array of forum ids which should not be searched
* @param array &$m_approve_fid_ary specifies an array of forum ids in which the searcher is allowed to view unapproved posts
* @param int &$topic_id is set to 0 or a topic id, if it is not 0 then only posts in this topic should be searched
* @param array &$author_ary an array of author ids; the array is empty if the author should be ignored during the search
* @param array &$id_ary passed by reference, to be filled with ids for the page specified by $start and $per_page, should be ordered
* @param int $start indicates the first index of the page
* @param int $per_page number of ids each page is supposed to contain
* @return boolean|int total number of results, false if there is no search query or nothing was found
*
* @access public
*/
function keyword_search($type, &$fields, &$terms, &$sort_by_sql, &$sort_key, &$sort_dir, &$sort_days, &$ex_fid_ary, &$m_approve_fid_ary, &$topic_id, &$author_ary, &$id_ary, $start, $per_page)
{
    global $config, $db;

    // No keywords? No posts.
    if (empty($this->search_query))
    {
        return false;
    }

    // generate a search_key from all the options to identify the results
    $search_key = md5(implode('#', array(
        serialize($this->must_contain_ids),
        serialize($this->must_not_contain_ids),
        serialize($this->must_exclude_one_ids),
        $type,
        $fields,
        $terms,
        $sort_days,
        $sort_key,
        $topic_id,
        implode(',', $ex_fid_ary),
        implode(',', $m_approve_fid_ary),
        implode(',', $author_ary)
    )));

    // try reading the results from cache
    $total_results = 0;
    if ($this->obtain_ids($search_key, $total_results, $id_ary, $start, $per_page, $sort_dir) == SEARCH_RESULT_IN_CACHE)
    {
        return $total_results;
    }

    $id_ary = array();

    $sql_where = array();
    $m_num = 0;
    $w_num = 0;

    $sql_array = array(
        'SELECT'    => ($type == 'posts') ? 'p.post_id' : 'p.topic_id',
        'FROM'      => array(
            SEARCH_WORDMATCH_TABLE  => array(),
            SEARCH_WORDLIST_TABLE   => array(),
        ),
        'LEFT_JOIN' => array(array(
            'FROM'  => array(POSTS_TABLE => 'p'),
            'ON'    => 'm0.post_id = p.post_id',
        )),
    );

    $title_match = '';
    $left_join_topics = false;
    $group_by = true;
    // Build some display specific sql strings
    switch ($fields)
    {
        case 'titleonly':
            $title_match = 'title_match = 1';
            $group_by = false;
        // no break
        case 'firstpost':
            $left_join_topics = true;
            $sql_where[] = 'p.post_id = t.topic_first_post_id';
        break;

        case 'msgonly':
            $title_match = 'title_match = 0';
            $group_by = false;
        break;
    }

    if ($type == 'topics')
    {
        $left_join_topics = true;
        $group_by = true;
    }

    /**
    * @todo Add a query optimizer (handle stuff like "+(4|3) +4")
    */

    foreach ($this->must_contain_ids as $subquery)
    {
        if (is_array($subquery))
        {
            // a group of words, at least one of them has to match
            $group_by = true;

            $word_ids = array();
            foreach ($subquery as $id)
            {
                if (is_string($id))
                {
                    // strings are LIKE patterns built from wildcard words
                    $sql_array['LEFT_JOIN'][] = array(
                        'FROM'  => array(SEARCH_WORDLIST_TABLE => 'w' . $w_num),
                        'ON'    => "w$w_num.word_text LIKE $id"
                    );
                    $word_ids[] = "w$w_num.word_id";

                    $w_num++;
                }
                else
                {
                    $word_ids[] = $id;
                }
            }

            $sql_where[] = $db->sql_in_set("m$m_num.word_id", $word_ids);

            unset($word_ids);
        }
        else if (is_string($subquery))
        {
            // a single wildcard word
            $sql_array['FROM'][SEARCH_WORDLIST_TABLE][] = 'w' . $w_num;

            $sql_where[] = "w$w_num.word_text LIKE $subquery";
            $sql_where[] = "m$m_num.word_id = w$w_num.word_id";

            $group_by = true;
            $w_num++;
        }
        else
        {
            // a single word id
            $sql_where[] = "m$m_num.word_id = $subquery";
        }

        $sql_array['FROM'][SEARCH_WORDMATCH_TABLE][] = 'm' . $m_num;

        if ($title_match)
        {
            $sql_where[] = "m$m_num.$title_match";
        }

        // every further match alias has to refer to the same post as m0
        if ($m_num != 0)
        {
            $sql_where[] = "m$m_num.post_id = m0.post_id";
        }
        $m_num++;
    }

    // wildcard words among the forbidden words are resolved through a wordlist join
    foreach ($this->must_not_contain_ids as $key => $subquery)
    {
        if (is_string($subquery))
        {
            $sql_array['LEFT_JOIN'][] = array(
                'FROM'  => array(SEARCH_WORDLIST_TABLE => 'w' . $w_num),
                'ON'    => "w$w_num.word_text LIKE $subquery"
            );

            $this->must_not_contain_ids[$key] = "w$w_num.word_id";

            $group_by = true;
            $w_num++;
        }
    }

    if (sizeof($this->must_not_contain_ids))
    {
        $sql_array['LEFT_JOIN'][] = array(
            'FROM'  => array(SEARCH_WORDMATCH_TABLE => 'm' . $m_num),
            'ON'    => $db->sql_in_set("m$m_num.word_id", $this->must_not_contain_ids) . (($title_match) ? " AND m$m_num.$title_match" : '') . " AND m$m_num.post_id = m0.post_id"
        );

        // the post qualifies only if none of the forbidden words could be joined
        $sql_where[] = "m$m_num.word_id IS NULL";
        $m_num++;
    }

    foreach ($this->must_exclude_one_ids as $ids)
    {
        $is_null_joins = array();
        foreach ($ids as $id)
        {
            if (is_string($id))
            {
                $sql_array['LEFT_JOIN'][] = array(
                    'FROM'  => array(SEARCH_WORDLIST_TABLE => 'w' . $w_num),
                    'ON'    => "w$w_num.word_text LIKE $id"
                );
                $id = "w$w_num.word_id";

                $group_by = true;
                $w_num++;
            }

            $sql_array['LEFT_JOIN'][] = array(
                'FROM'  => array(SEARCH_WORDMATCH_TABLE => 'm' . $m_num),
                'ON'    => "m$m_num.word_id = $id AND m$m_num.post_id = m0.post_id" . (($title_match) ? " AND m$m_num.$title_match" : '')
            );
            $is_null_joins[] = "m$m_num.word_id IS NULL";

            $m_num++;
        }
        // at least one word of the group has to be missing
        $sql_where[] = '(' . implode(' OR ', $is_null_joins) . ')';
    }

    if (!sizeof($m_approve_fid_ary))
    {
        $sql_where[] = 'p.post_approved = 1';
    }
    else if ($m_approve_fid_ary !== array(-1))
    {
        $sql_where[] = '(p.post_approved = 1 OR ' . $db->sql_in_set('p.forum_id', $m_approve_fid_ary, true) . ')';
    }

    if ($topic_id)
    {
        // cast like author_search() does, the value is concatenated into the SQL
        $sql_where[] = 'p.topic_id = ' . (int) $topic_id;
    }

    if (sizeof($author_ary))
    {
        $sql_where[] = $db->sql_in_set('p.poster_id', $author_ary);
    }

    if (sizeof($ex_fid_ary))
    {
        $sql_where[] = $db->sql_in_set('p.forum_id', $ex_fid_ary, true);
    }

    if ($sort_days)
    {
        $sql_where[] = 'p.post_time >= ' . (time() - ($sort_days * 86400));
    }

    $sql_array['WHERE'] = implode(' AND ', $sql_where);

    $is_mysql = false;
    // if the total result count is not cached yet, retrieve it from the db
    if (!$total_results)
    {
        $sql = '';
        $sql_array_count = $sql_array;

        // The WHERE clause may already reference t.topic_first_post_id, so the
        // separate count query needs the topics table as well.
        if ($left_join_topics)
        {
            $sql_array_count['LEFT_JOIN'][] = array(
                'FROM'  => array(TOPICS_TABLE => 't'),
                'ON'    => 'p.topic_id = t.topic_id'
            );
        }

        switch ($db->sql_layer)
        {
            case 'mysql4':
            case 'mysqli':

                // 3.x does not support SQL_CALC_FOUND_ROWS
                $sql_array['SELECT'] = 'SQL_CALC_FOUND_ROWS ' . $sql_array['SELECT'];
                $is_mysql = true;

            break;

            case 'sqlite':
                $sql_array_count['SELECT'] = ($type == 'posts') ? 'DISTINCT p.post_id' : 'DISTINCT p.topic_id';
                $sql = 'SELECT COUNT(' . (($type == 'posts') ? 'post_id' : 'topic_id') . ') as total_results
                    FROM (' . $db->sql_build_query('SELECT', $sql_array_count) . ')';

            // no break

            default:
                $sql_array_count['SELECT'] = ($type == 'posts') ? 'COUNT(DISTINCT p.post_id) AS total_results' : 'COUNT(DISTINCT p.topic_id) AS total_results';
                // keep the query built in the sqlite case if there is one
                $sql = (!$sql) ? $db->sql_build_query('SELECT', $sql_array_count) : $sql;

                $result = $db->sql_query($sql);
                $total_results = (int) $db->sql_fetchfield('total_results');
                $db->sql_freeresult($result);

                if (!$total_results)
                {
                    return false;
                }
            break;
        }

        unset($sql_array_count, $sql);
    }

    // Build sql strings for sorting
    $sql_sort = $sort_by_sql[$sort_key] . (($sort_dir == 'a') ? ' ASC' : ' DESC');

    // the first character of the sort expression is the alias of the table it needs
    switch ($sql_sort[0])
    {
        case 'u':
            $sql_array['FROM'][USERS_TABLE] = 'u';
            $sql_where[] = 'u.user_id = p.poster_id ';
        break;

        case 't':
            $left_join_topics = true;
        break;

        case 'f':
            $sql_array['FROM'][FORUMS_TABLE] = 'f';
            $sql_where[] = 'f.forum_id = p.forum_id';
        break;
    }

    if ($left_join_topics)
    {
        // Append the join. Using the boolean flag as array key would cast it
        // to 1 and overwrite a wildcard join stored at that index.
        $sql_array['LEFT_JOIN'][] = array(
            'FROM'  => array(TOPICS_TABLE => 't'),
            'ON'    => 'p.topic_id = t.topic_id'
        );
    }

    $sql_array['WHERE'] = implode(' AND ', $sql_where);

    if ($group_by)
    {
        $sql_array['GROUP_BY'] = (($type == 'posts') ? 'p.post_id' : 'p.topic_id') . ', ' . $sort_by_sql[$sort_key];
    }
    else
    {
        $sql_array['GROUP_BY'] = '';
    }
    $sql_array['ORDER_BY'] = $sql_sort;

    unset($sql_where, $sql_sort, $group_by);

    $sql = $db->sql_build_query('SELECT', $sql_array);
    $result = $db->sql_query_limit($sql, $config['search_block_size'], $start);

    while ($row = $db->sql_fetchrow($result))
    {
        $id_ary[] = $row[(($type == 'posts') ? 'post_id' : 'topic_id')];
    }
    $db->sql_freeresult($result);

    if (!sizeof($id_ary))
    {
        return false;
    }

    // if we use mysql and the total result count is not cached yet, retrieve it from the db
    if (!$total_results && $is_mysql)
    {
        $sql = 'SELECT FOUND_ROWS() as total_results';
        $result = $db->sql_query($sql);
        $total_results = (int) $db->sql_fetchfield('total_results');
        $db->sql_freeresult($result);

        if (!$total_results)
        {
            return false;
        }
    }

    // store the ids, from start on then delete anything that isn't on the current page because we only need ids for one page
    $this->save_ids($search_key, $this->search_query, $author_ary, $total_results, $id_ary, $start, $sort_dir);
    $id_ary = array_slice($id_ary, 0, (int) $per_page);

    return $total_results;
}
745
/**
* Searches for posts or topics by author only, message contents are not
* taken into account. Results are read from the search cache through
* obtain_ids() when available and written to it through save_ids().
*
* @param string $type contains either posts or topics depending on what should be searched for
* @param boolean $firstpost_only if true, only topic starting posts will be considered
* @param array &$sort_by_sql contains SQL code for the ORDER BY part of a query
* @param string &$sort_key is the key of $sort_by_sql for the selected sorting
* @param string &$sort_dir is either a or d representing ASC and DESC
* @param string &$sort_days specifies the maximum amount of days a post may be old
* @param array &$ex_fid_ary specifies an array of forum ids which should not be searched
* @param array &$m_approve_fid_ary specifies an array of forum ids in which the searcher is allowed to view unapproved posts
* @param int &$topic_id is set to 0 or a topic id, if it is not 0 then only posts in this topic should be searched
* @param array &$author_ary an array of author ids
* @param array &$id_ary passed by reference, to be filled with ids for the page specified by $start and $per_page, should be ordered
* @param int $start indicates the first index of the page
* @param int $per_page number of ids each page is supposed to contain
* @return boolean|int total number of results, 0 if no author was given, false if nothing was found
*
* @access public
*/
function author_search($type, $firstpost_only, &$sort_by_sql, &$sort_key, &$sort_dir, &$sort_days, &$ex_fid_ary, &$m_approve_fid_ary, &$topic_id, &$author_ary, &$id_ary, $start, $per_page)
{
    global $config, $db;

    // Without an author there is nothing to look for
    if (!sizeof($author_ary))
    {
        return 0;
    }

    // The cache key is derived from every option that influences the result
    $key_parts = array(
        '',
        $type,
        ($firstpost_only) ? 'firstpost' : '',
        '',
        '',
        $sort_days,
        $sort_key,
        $topic_id,
        implode(',', $ex_fid_ary),
        implode(',', $m_approve_fid_ary),
        implode(',', $author_ary)
    );
    $search_key = md5(implode('#', $key_parts));

    // Serve the request from the cache if possible
    $total_results = 0;
    if ($this->obtain_ids($search_key, $total_results, $id_ary, $start, $per_page, $sort_dir) == SEARCH_RESULT_IN_CACHE)
    {
        return $total_results;
    }

    $id_ary = array();
    $search_posts = ($type == 'posts');

    // Conditions shared by the count query and the id query
    $where_author = $db->sql_in_set('p.poster_id', $author_ary);

    $where_forums = '';
    if (sizeof($ex_fid_ary))
    {
        $where_forums = ' AND ' . $db->sql_in_set('p.forum_id', $ex_fid_ary, true);
    }

    $where_time = '';
    if ($sort_days)
    {
        $where_time = ' AND p.post_time >= ' . (time() - ($sort_days * 86400));
    }

    $where_topic = '';
    if ($topic_id)
    {
        $where_topic = ' AND p.topic_id = ' . (int) $topic_id;
    }

    $where_firstpost = '';
    if ($firstpost_only)
    {
        $where_firstpost = ' AND p.post_id = t.topic_first_post_id';
    }

    // Sorting: the first character of the sort expression is the alias of the table it needs
    $order_by = $sort_by_sql[$sort_key] . (($sort_dir == 'a') ? ' ASC' : ' DESC');
    $sort_table = '';
    $sort_join = '';
    switch ($order_by[0])
    {
        case 'u':
            $sort_table = USERS_TABLE . ' u, ';
            $sort_join = ' AND u.user_id = p.poster_id ';
        break;

        case 't':
            // only a post query without the first post restriction lacks the topics table
            if ($search_posts && !$firstpost_only)
            {
                $sort_table = TOPICS_TABLE . ' t, ';
                $sort_join = ' AND t.topic_id = p.topic_id ';
            }
        break;

        case 'f':
            $sort_table = FORUMS_TABLE . ' f, ';
            $sort_join = ' AND f.forum_id = p.forum_id ';
        break;
    }

    // Visibility of unapproved posts
    if (!sizeof($m_approve_fid_ary))
    {
        $where_approved = ' AND p.post_approved = 1';
    }
    else if ($m_approve_fid_ary == array(-1))
    {
        $where_approved = '';
    }
    else
    {
        $where_approved = ' AND (p.post_approved = 1 OR ' . $db->sql_in_set('p.forum_id', $m_approve_fid_ary, true) . ')';
    }

    $select = ($search_posts) ? 'p.post_id' : 't.topic_id';
    $is_mysql = false;

    // Nothing cached: the total number of results has to be determined
    if (!$total_results)
    {
        if ($db->sql_layer == 'mysql4' || $db->sql_layer == 'mysqli')
        {
            // counted after the id query through FOUND_ROWS()
            $select = 'SQL_CALC_FOUND_ROWS ' . $select;
            $is_mysql = true;
        }
        else
        {
            if ($search_posts)
            {
                $sql = 'SELECT COUNT(p.post_id) as total_results
                    FROM ' . POSTS_TABLE . ' p' . (($firstpost_only) ? ', ' . TOPICS_TABLE . ' t ' : ' ') . "
                    WHERE $where_author
                        $where_topic
                        $where_firstpost
                        $where_approved
                        $where_forums
                        $where_time";
            }
            else
            {
                $is_sqlite = ($db->sql_layer == 'sqlite');

                if ($is_sqlite)
                {
                    $sql = 'SELECT COUNT(topic_id) as total_results
                        FROM (SELECT DISTINCT t.topic_id';
                }
                else
                {
                    $sql = 'SELECT COUNT(DISTINCT t.topic_id) as total_results';
                }

                $sql .= ' FROM ' . TOPICS_TABLE . ' t, ' . POSTS_TABLE . " p
                    WHERE $where_author
                        $where_topic
                        $where_firstpost
                        $where_approved
                        $where_forums
                        AND t.topic_id = p.topic_id
                        $where_time" . (($is_sqlite) ? ')' : '');
            }
            $result = $db->sql_query($sql);

            $total_results = (int) $db->sql_fetchfield('total_results');
            $db->sql_freeresult($result);

            if (!$total_results)
            {
                return false;
            }
        }
    }

    // Query selecting the actual ids
    if ($search_posts)
    {
        $sql = "SELECT $select
            FROM " . $sort_table . POSTS_TABLE . ' p' . (($firstpost_only) ? ', ' . TOPICS_TABLE . ' t' : '') . "
            WHERE $where_author
                $where_topic
                $where_firstpost
                $where_approved
                $where_forums
                $sort_join
                $where_time
            ORDER BY $order_by";
        $field = 'post_id';
    }
    else
    {
        $sql = "SELECT $select
            FROM " . $sort_table . TOPICS_TABLE . ' t, ' . POSTS_TABLE . " p
            WHERE $where_author
                $where_topic
                $where_firstpost
                $where_approved
                $where_forums
                AND t.topic_id = p.topic_id
                $sort_join
                $where_time
            GROUP BY t.topic_id, " . $sort_by_sql[$sort_key] . '
            ORDER BY ' . $order_by;
        $field = 'topic_id';
    }

    // Only read one block of posts from the db and then cache it
    $result = $db->sql_query_limit($sql, $config['search_block_size'], $start);

    while ($row = $db->sql_fetchrow($result))
    {
        $id_ary[] = $row[$field];
    }
    $db->sql_freeresult($result);

    if (!$total_results && $is_mysql)
    {
        $sql = 'SELECT FOUND_ROWS() as total_results';
        $result = $db->sql_query($sql);
        $total_results = (int) $db->sql_fetchfield('total_results');
        $db->sql_freeresult($result);

        if (!$total_results)
        {
            return false;
        }
    }

    if (!sizeof($id_ary))
    {
        return false;
    }

    // Cache the whole block, hand back only the ids of the requested page
    $this->save_ids($search_key, '', $author_ary, $total_results, $id_ary, $start, $sort_dir);
    $id_ary = array_slice($id_ary, 0, $per_page);

    return $total_results;
}
964
/**
* Splits a text into the words that are to be indexed
*
* HTML tags, [code] blocks and BBCode tags are removed, the remaining text
* is passed through cleanup() and split at spaces. Words longer than 255
* bytes or shorter than the minimum word length are dropped; words that
* pass the byte check but have too few characters are kept only if they
* start within the Hangul or CJK ranges.
*
* NOTE: duplicates are NOT removed from the return array
*
* @param string $text Text to split, encoded in UTF-8
* @return array Array of UTF-8 words
*
* @access private
*/
function split_message($text)
{
    global $phpbb_root_path, $phpEx, $user;

    /**
    * Taken from the original code
    */
    $strip_patterns = array(
        // Do not index code
        '#\[code(?:=.*?)?(\:?[0-9a-z]{5,})\].*?\[\/code(\:?[0-9a-z]{5,})\]#is',
        // BBcode
        '#\[\/?[a-z0-9\*\+\-]+(?:=.*?)?(?::[a-z])?(\:?[0-9a-z]{5,})\]#',
    );

    $min = $this->word_length['min'];
    $max = $this->word_length['max'];

    // words with this many bytes or fewer can never reach $min characters
    $too_few_bytes = $min - 1;

    /**
    * Clean up the string, remove HTML tags, remove BBCodes
    */
    $plain = strip_tags($text);
    $plain = preg_replace($strip_patterns, ' ', $plain);
    $cleaned = $this->cleanup($plain, -1);

    $words = array();

    for ($word = strtok($cleaned, ' '); strlen($word); $word = strtok(' '))
    {
        $bytes = strlen($word);

        /**
        * Words longer than 255 bytes are ignored. This will have to be
        * changed whenever we change the length of search_wordlist.word_text
        *
        * Words with too few bytes are ignored, too
        */
        if ($bytes > 255 || $bytes <= $too_few_bytes)
        {
            continue;
        }

        /**
        * Test whether the word is too short to be indexed.
        *
        * Note that this limit does NOT apply to CJK and Hangul
        */
        if (utf8_strlen($word) < $min)
        {
            /**
            * Note: this could be optimized. If the codepoint is lower than Hangul's range
            * we know that it will also be lower than CJK ranges
            */
            $is_hangul = (strncmp($word, UTF8_HANGUL_FIRST, 3) >= 0 && strncmp($word, UTF8_HANGUL_LAST, 3) <= 0);
            $is_cjk = (strncmp($word, UTF8_CJK_FIRST, 3) >= 0 && strncmp($word, UTF8_CJK_LAST, 3) <= 0);
            $is_cjk_b = (strncmp($word, UTF8_CJK_B_FIRST, 4) >= 0 && strncmp($word, UTF8_CJK_B_LAST, 4) <= 0);

            if (!$is_hangul && !$is_cjk && !$is_cjk_b)
            {
                continue;
            }
        }

        $words[] = $word;
    }

    return $words;
}
1044
1045 /**
1046 * Updates wordlist and wordmatch tables when a message is posted or changed
1047 *
1048 * @param string $mode Contains the post mode: edit, post, reply, quote
1049 * @param int $post_id The id of the post which is modified/created
1050 * @param string &$message New or updated post content
1051 * @param string &$subject New or updated post subject
1052 * @param int $poster_id Post author's user id
1053 * @param int $forum_id The id of the forum in which the post is located
1054 *
1055 * @access public
1056 */
1057 function index($mode, $post_id, &$message, &$subject, $poster_id, $forum_id)
1058 {
1059 global $config, $db, $user;
1060
1061 if (!$config['fulltext_native_load_upd'])
1062 {
1063 /**
1064 * The search indexer is disabled, return
1065 */
1066 return;
1067 }
1068
1069 // Split old and new post/subject to obtain array of 'words'
1070 $split_text = $this->split_message($message);
1071 $split_title = $this->split_message($subject);
1072
1073 $cur_words = array('post' => array(), 'title' => array());
1074
1075 $words = array();
1076 if ($mode == 'edit')
1077 {
1078 $words['add']['post'] = array();
1079 $words['add']['title'] = array();
1080 $words['del']['post'] = array();
1081 $words['del']['title'] = array();
1082
1083 $sql = 'SELECT w.word_id, w.word_text, m.title_match
1084 FROM ' . SEARCH_WORDLIST_TABLE . ' w, ' . SEARCH_WORDMATCH_TABLE . " m
1085 WHERE m.post_id = $post_id
1086 AND w.word_id = m.word_id";
1087 $result = $db->sql_query($sql);
1088
1089 while ($row = $db->sql_fetchrow($result))
1090 {
1091 $which = ($row['title_match']) ? 'title' : 'post';
1092 $cur_words[$which][$row['word_text']] = $row['word_id'];
1093 }
1094 $db->sql_freeresult($result);
1095
1096 $words['add']['post'] = array_diff($split_text, array_keys($cur_words['post']));
1097 $words['add']['title'] = array_diff($split_title, array_keys($cur_words['title']));
1098 $words['del']['post'] = array_diff(array_keys($cur_words['post']), $split_text);
1099 $words['del']['title'] = array_diff(array_keys($cur_words['title']), $split_title);
1100 }
1101 else
1102 {
1103 $words['add']['post'] = $split_text;
1104 $words['add']['title'] = $split_title;
1105 $words['del']['post'] = array();
1106 $words['del']['title'] = array();
1107 }
1108 unset($split_text);
1109 unset($split_title);
1110
1111 // Get unique words from the above arrays
1112 $unique_add_words = array_unique(array_merge($words['add']['post'], $words['add']['title']));
1113
1114 // We now have unique arrays of all words to be added and removed and
1115 // individual arrays of added and removed words for text and title. What
1116 // we need to do now is add the new words (if they don't already exist)
1117 // and then add (or remove) matches between the words and this post
1118 if (sizeof($unique_add_words))
1119 {
1120 $sql = 'SELECT word_id, word_text
1121 FROM ' . SEARCH_WORDLIST_TABLE . '
1122 WHERE ' . $db->sql_in_set('word_text', $unique_add_words);
1123 $result = $db->sql_query($sql);
1124
1125 $word_ids = array();
1126 while ($row = $db->sql_fetchrow($result))
1127 {
1128 $word_ids[$row['word_text']] = $row['word_id'];
1129 }
1130 $db->sql_freeresult($result);
1131 $new_words = array_diff($unique_add_words, array_keys($word_ids));
1132
1133 $db->sql_transaction('begin');
1134 if (sizeof($new_words))
1135 {
1136 $sql_ary = array();
1137
1138 foreach ($new_words as $word)
1139 {
1140 $sql_ary[] = array('word_text' => (string) $word, 'word_count' => 0);
1141 }
1142 $db->sql_return_on_error(true);
1143 $db->sql_multi_insert(SEARCH_WORDLIST_TABLE, $sql_ary);
1144 $db->sql_return_on_error(false);
1145 }
1146 unset($new_words, $sql_ary);
1147 }
1148 else
1149 {
1150 $db->sql_transaction('begin');
1151 }
1152
1153 // now update the search match table, remove links to removed words and add links to new words
1154 foreach ($words['del'] as $word_in => $word_ary)
1155 {
1156 $title_match = ($word_in == 'title') ? 1 : 0;
1157
1158 if (sizeof($word_ary))
1159 {
1160 $sql_in = array();
1161 foreach ($word_ary as $word)
1162 {
1163 $sql_in[] = $cur_words[$word_in][$word];
1164 }
1165
1166 $sql = 'DELETE FROM ' . SEARCH_WORDMATCH_TABLE . '
1167 WHERE ' . $db->sql_in_set('word_id', $sql_in) . '
1168 AND post_id = ' . intval($post_id) . "
1169 AND title_match = $title_match";
1170 $db->sql_query($sql);
1171
1172 $sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
1173 SET word_count = word_count - 1
1174 WHERE ' . $db->sql_in_set('word_id', $sql_in) . '
1175 AND word_count > 0';
1176 $db->sql_query($sql);
1177
1178 unset($sql_in);
1179 }
1180 }
1181
1182 $db->sql_return_on_error(true);
1183 foreach ($words['add'] as $word_in => $word_ary)
1184 {
1185 $title_match = ($word_in == 'title') ? 1 : 0;
1186
1187 if (sizeof($word_ary))
1188 {
1189 $sql = 'INSERT INTO ' . SEARCH_WORDMATCH_TABLE . ' (post_id, word_id, title_match)
1190 SELECT ' . (int) $post_id . ', word_id, ' . (int) $title_match . '
1191 FROM ' . SEARCH_WORDLIST_TABLE . '
1192 WHERE ' . $db->sql_in_set('word_text', $word_ary);
1193 $db->sql_query($sql);
1194
1195 $sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
1196 SET word_count = word_count + 1
1197 WHERE ' . $db->sql_in_set('word_text', $word_ary);
1198 $db->sql_query($sql);
1199 }
1200 }
1201 $db->sql_return_on_error(false);
1202
1203 $db->sql_transaction('commit');
1204
1205 // destroy cached search results containing any of the words removed or added
1206 $this->destroy_cache(array_unique(array_merge($words['add']['post'], $words['add']['title'], $words['del']['post'], $words['del']['title'])), array($poster_id));
1207
1208 unset($unique_add_words);
1209 unset($words);
1210 unset($cur_words);
1211 }
1212
1213 /**
1214 * Removes entries from the wordmatch table for the specified post_ids
1215 */
function index_remove($post_ids, $author_ids, $forum_ids)
{
	/**
	* Drops every word match that belongs to the given posts, lowers the
	* usage counter of the affected words and invalidates cached search
	* results that contain one of those words.
	*
	* @param array $post_ids	Ids of the posts whose matches are removed
	* @param array $author_ids	Handed to destroy_cache() together with the affected words
	* @param array $forum_ids	Not referenced by this method
	*/
	global $db;

	// Declared outside of the branch below so that destroy_cache() always
	// receives an array, even if no post ids were passed in.
	$word_texts = array();

	if (sizeof($post_ids))
	{
		$sql = 'SELECT w.word_id, w.word_text, m.title_match
			FROM ' . SEARCH_WORDMATCH_TABLE . ' m, ' . SEARCH_WORDLIST_TABLE . ' w
			WHERE ' . $db->sql_in_set('m.post_id', $post_ids) . '
				AND w.word_id = m.word_id';
		$result = $db->sql_query($sql);

		// Title and message matches are counted separately in the word list,
		// so they are collected separately here as well
		$message_word_ids = $title_word_ids = array();
		while ($row = $db->sql_fetchrow($result))
		{
			if ($row['title_match'])
			{
				$title_word_ids[] = $row['word_id'];
			}
			else
			{
				$message_word_ids[] = $row['word_id'];
			}
			$word_texts[] = $row['word_text'];
		}
		$db->sql_freeresult($result);

		// One decrement for the title matches, one for the message matches
		foreach (array($title_word_ids, $message_word_ids) as $word_ids)
		{
			if (sizeof($word_ids))
			{
				$sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
					SET word_count = word_count - 1
					WHERE ' . $db->sql_in_set('word_id', $word_ids) . '
						AND word_count > 0';
				$db->sql_query($sql);
			}
		}

		unset($title_word_ids);
		unset($message_word_ids);

		$sql = 'DELETE FROM ' . SEARCH_WORDMATCH_TABLE . '
			WHERE ' . $db->sql_in_set('post_id', $post_ids);
		$db->sql_query($sql);
	}

	$this->destroy_cache(array_unique($word_texts), $author_ids);
}
1271
/**
* Tidy up indexes: flag 'common words' in the word list and
* delete their entries from the word match table
*/
function tidy()
{
	/**
	* Periodic maintenance of the native index: words that occur in more
	* posts than the configured percentage allows (or that are flagged
	* already) are marked as common and lose their word match entries.
	* The time of the run is stored in the search_last_gc config value.
	*/
	global $db, $config;

	// With index updates switched off there is nothing to maintain,
	// only the time of this run is recorded
	if (!$config['fulltext_native_load_upd'])
	{
		set_config('search_last_gc', time(), true);
		return;
	}

	$stale_words = array();

	// The threshold is only applied to boards with at least 100 posts
	// and a non-zero percentage
	if ($config['num_posts'] >= 100 && $config['fulltext_native_common_thres'])
	{
		$ratio = ((double) $config['fulltext_native_common_thres']) / 100.0;
		$max_count = floor($config['num_posts'] * $ratio);

		// Words above the threshold plus those flagged on an earlier run
		$sql = 'SELECT word_id, word_text
			FROM ' . SEARCH_WORDLIST_TABLE . '
			WHERE word_count > ' . $max_count . '
				OR word_common = 1';
		$result = $db->sql_query($sql);

		$common_ids = array();
		while ($row = $db->sql_fetchrow($result))
		{
			$common_ids[] = $row['word_id'];
			$stale_words[] = $row['word_text'];
		}
		$db->sql_freeresult($result);

		if (count($common_ids) > 0)
		{
			$id_filter = $db->sql_in_set('word_id', $common_ids);

			// Flag the words
			$db->sql_query('UPDATE ' . SEARCH_WORDLIST_TABLE . '
				SET word_common = 1
				WHERE ' . $id_filter);

			// The timestamp is stored before the (potentially slow) delete so
			// that a user who reloads the page does not start the same query again
			set_config('search_last_gc', time(), true);

			// Delete the matches
			$db->sql_query('DELETE FROM ' . SEARCH_WORDMATCH_TABLE . '
				WHERE ' . $id_filter);
		}
	}

	// Cached search results containing one of the common words are outdated now
	if (count($stale_words) > 0)
	{
		$this->destroy_cache(array_unique($stale_words));
	}

	set_config('search_last_gc', time(), true);
}
1337
1338 /**
1339 * Deletes all words from the index
1340 */
function delete_index($acp_module, $u_action)
{
	/**
	* Empties the word list, word match and search results tables.
	*
	* Neither $acp_module nor $u_action is referenced by this method.
	*/
	global $db;

	// The sqlite and firebird layers get a plain DELETE, every other layer a TRUNCATE
	$statement = in_array($db->sql_layer, array('sqlite', 'firebird')) ? 'DELETE FROM ' : 'TRUNCATE TABLE ';

	foreach (array(SEARCH_WORDLIST_TABLE, SEARCH_WORDMATCH_TABLE, SEARCH_RESULTS_TABLE) as $table)
	{
		$db->sql_query($statement . $table);
	}
}
1361
/**
* Returns true if the index holds data, i.e. both the word list
* and the word match table contain at least one row
*/
function index_created()
{
	// The row counts are fetched on first use only
	if (count($this->stats) == 0)
	{
		$this->get_stats();
	}

	// The index counts as created once both tables hold at least one row
	return (bool) ($this->stats['total_words'] && $this->stats['total_matches']);
}
1374
1375 /**
1376 * Returns an associative array containing information about the indexes
1377 */
function index_stats()
{
	global $user;

	// The row counts are fetched on first use only
	if (count($this->stats) == 0)
	{
		$this->get_stats();
	}

	// Keys are the localised labels, values the corresponding counters
	$report = array();
	$report[$user->lang['TOTAL_WORDS']] = $this->stats['total_words'];
	$report[$user->lang['TOTAL_MATCHES']] = $this->stats['total_matches'];

	return $report;
}
1391
function get_stats()
{
	/**
	* Counts the rows of the word list and the word match table and
	* stores the results in $this->stats
	*/
	global $db;

	// stats key (also used as the column alias) => table that is counted
	$counters = array(
		'total_words'	=> SEARCH_WORDLIST_TABLE,
		'total_matches'	=> SEARCH_WORDMATCH_TABLE,
	);

	foreach ($counters as $alias => $table)
	{
		$sql = 'SELECT COUNT(*) as ' . $alias . '
			FROM ' . $table;
		$result = $db->sql_query($sql);
		$this->stats[$alias] = (int) $db->sql_fetchfield($alias);
		$db->sql_freeresult($result);
	}
}
1408
/**
* Clean up a text to remove non-alphanumeric characters
*
* This method receives a UTF-8 string, normalizes and validates it, replaces all
* non-alphanumeric characters with spaces, then returns the result.
*
* Any number of "allowed chars" can be passed as a UTF-8 string in NFC.
*
* @param string $text Text to split, in UTF-8 (not normalized or sanitized)
* @param string $allowed_chars String of special chars to allow
* @param string $encoding Text encoding
* @return string Cleaned up text, only alphanumeric chars are left
*
* @todo normalizer::cleanup being able to be used?
*/
function cleanup($text, $allowed_chars = null, $encoding = 'utf-8')
{
	/**
	* Reduces a text to lowercase alphanumeric characters separated by spaces.
	*
	* The text is recoded to UTF-8 if necessary, entities and NCRs are decoded
	* and the result is normalized with utf_normalizer::nfc(). ASCII letters
	* are lowercased, all other bytes listed in $sb_match become spaces, and
	* every multibyte character is either kept (Hangul/CJK ranges, explicitly
	* allowed chars), mapped through the search indexer tables or replaced
	* with a space.
	*
	* @param string $text			Text to clean up
	* @param string $allowed_chars	Extra characters (UTF-8) that are kept as they are
	* @param string $encoding		Encoding of $text, anything but utf-8 is recoded first
	* @return string				The cleaned up text
	*/
	global $phpbb_root_path, $phpEx;
	static $conv = array(), $conv_loaded = array();
	$allow = array();

	// Convert the text to UTF-8
	$encoding = strtolower($encoding);
	if ($encoding != 'utf-8')
	{
		$text = utf8_recode($text, $encoding);
	}

	// Length of a UTF-8 sequence, indexed by the high nibble of its first byte
	$utf_len_mask = array(
		"\xC0"	=> 2,
		"\xD0"	=> 2,
		"\xE0"	=> 3,
		"\xF0"	=> 4
	);

	/**
	* Replace HTML entities and NCRs
	*/
	$text = htmlspecialchars_decode(utf8_decode_ncr($text), ENT_QUOTES);

	/**
	* Normalize the text to NFC (the text is passed by reference)
	*/
	utf_normalizer::nfc($text);

	/**
	* The first thing we do is:
	*
	* - convert ASCII-7 letters to lowercase
	* - remove the ASCII-7 non-alpha characters
	* - remove the bytes that should not appear in a valid UTF-8 string: 0xC0,
	*   0xC1 and 0xF5-0xFF
	*
	* @todo in theory, the third one is already taken care of during normalization and those chars should have been replaced by Unicode replacement chars
	*/
	$sb_match = "ISTCPAMELRDOJBNHFGVWUQKYXZ\r\n\t!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\xC0\xC1\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF";

	// strtr() only translates as many characters as the shorter of its two
	// strings holds, so the replacement is padded with one space for every
	// byte of $sb_match that follows the uppercase letters.
	$sb_lower = 'istcpamelrdojbnhfgvwuqkyxz';
	$sb_replace = $sb_lower . str_repeat(' ', strlen($sb_match) - strlen($sb_lower));

	/**
	* This is the list of legal ASCII chars, it is automatically extended
	* with ASCII chars from $allowed_chars
	*/
	$legal_ascii = ' eaisntroludcpmghbfvq10xy2j9kw354867z';

	/**
	* Prepare an array containing the extra chars to allow
	*/
	if (isset($allowed_chars[0]))
	{
		$pos = 0;
		$len = strlen($allowed_chars);
		do
		{
			$c = $allowed_chars[$pos];

			if ($c < "\x80")
			{
				/**
				* ASCII char
				*/
				$sb_pos = strpos($sb_match, $c);
				if (is_int($sb_pos))
				{
					/**
					* Remove the char from $sb_match and its corresponding
					* replacement in $sb_replace
					*/
					$sb_match = substr($sb_match, 0, $sb_pos) . substr($sb_match, $sb_pos + 1);
					$sb_replace = substr($sb_replace, 0, $sb_pos) . substr($sb_replace, $sb_pos + 1);
					$legal_ascii .= $c;
				}

				++$pos;
			}
			else
			{
				/**
				* UTF-8 char
				*/
				$lead = $c & "\xF0";
				if (!isset($utf_len_mask[$lead]))
				{
					// Not a valid first byte of a sequence: skip it, otherwise
					// $pos would never advance
					++$pos;
					continue;
				}

				$utf_len = $utf_len_mask[$lead];
				$allow[substr($allowed_chars, $pos, $utf_len)] = 1;
				$pos += $utf_len;
			}
		}
		while ($pos < $len);
	}

	$text = strtr($text, $sb_match, $sb_replace);
	$ret = '';

	$pos = 0;
	$len = strlen($text);

	while ($pos < $len)
	{
		/**
		* Do all consecutive ASCII chars at once
		*/
		if ($spn = strspn($text, $legal_ascii, $pos))
		{
			$ret .= substr($text, $pos, $spn);
			$pos += $spn;

			if ($pos >= $len)
			{
				break;
			}
		}

		/**
		* Capture the UTF char
		*/
		$lead = $text[$pos] & "\xF0";
		if (!isset($utf_len_mask[$lead]))
		{
			/**
			* Neither a legal ASCII char nor the first byte of a UTF-8
			* sequence (0x7F or a stray continuation byte for instance).
			* Treat the byte as a separator and move on, the position
			* would not advance otherwise.
			*/
			$ret .= ' ';
			++$pos;
			continue;
		}

		$utf_len = $utf_len_mask[$lead];
		$utf_char = substr($text, $pos, $utf_len);
		$pos += $utf_len;

		if (($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST)
			|| ($utf_char >= UTF8_CJK_FIRST && $utf_char <= UTF8_CJK_LAST)
			|| ($utf_char >= UTF8_CJK_B_FIRST && $utf_char <= UTF8_CJK_B_LAST))
		{
			/**
			* All characters within these ranges are valid
			*
			* We separate them with a space in order to index each character
			* individually
			*/
			$ret .= ' ' . $utf_char . ' ';
			continue;
		}

		if (isset($allow[$utf_char]))
		{
			/**
			* The char is explicitly allowed
			*/
			$ret .= $utf_char;
			continue;
		}

		if (isset($conv[$utf_char]))
		{
			/**
			* The char is mapped to something, maybe to itself actually
			*/
			$ret .= $conv[$utf_char];
			continue;
		}

		/**
		* The char isn't mapped, but did we load its conversion table?
		*
		* The search indexer table is split into blocks. The block number of
		* each char is equal to its codepoint right-shifted for 11 bits. It
		* means that out of the 11, 16 or 21 meaningful bits of a 2-, 3- or
		* 4- byte sequence we only keep the leftmost 0, 5 or 10 bits. Thus,
		* all UTF chars encoded in 2 bytes are in the same first block.
		*/
		if (isset($utf_char[2]))
		{
			if (isset($utf_char[3]))
			{
				/**
				* 1111 0nnn 10nn nnnn 10nx xxxx 10xx xxxx
				* 0000 0111 0011 1111 0010 0000
				*/
				$idx = ((ord($utf_char[0]) & 0x07) << 7) | ((ord($utf_char[1]) & 0x3F) << 1) | ((ord($utf_char[2]) & 0x20) >> 5);
			}
			else
			{
				/**
				* 1110 nnnn 10nx xxxx 10xx xxxx
				* 0000 0111 0010 0000
				*
				* NOTE(review): the mask 0x07 keeps only three of the four
				* payload bits of the first byte, which does not look like
				* "codepoint >> 11" for codepoints from U+8000 upwards.
				* Left untouched here - confirm against the naming of the
				* search_indexer_* data files before changing it.
				*/
				$idx = ((ord($utf_char[0]) & 0x07) << 1) | ((ord($utf_char[1]) & 0x20) >> 5);
			}
		}
		else
		{
			/**
			* 110x xxxx 10xx xxxx
			* 0000 0000 0000 0000
			*/
			$idx = 0;
		}

		/**
		* Check if the required conv table has been loaded already
		*/
		if (!isset($conv_loaded[$idx]))
		{
			$conv_loaded[$idx] = 1;
			$file = $phpbb_root_path . 'includes/utf/data/search_indexer_' . $idx . '.' . $phpEx;

			if (file_exists($file))
			{
				$conv += include($file);
			}
		}

		if (isset($conv[$utf_char]))
		{
			$ret .= $conv[$utf_char];
		}
		else
		{
			/**
			* We add an entry to the conversion table so that we
			* don't have to convert to codepoint and perform the checks
			* that are above this block
			*/
			$conv[$utf_char] = ' ';
			$ret .= ' ';
		}
	}

	return $ret;
}
1650
1651 /**
1652 * Returns a list of options for the ACP to display
1653 */
function acp()
{
	/**
	* Builds the settings form fragment of this backend for the ACP.
	*
	* @return array 'tpl' holds the HTML, 'config' maps each config key
	*				of the form to its validation type
	*/
	global $user, $config;

	// Exactly one of the two radio buttons carries the checked attribute
	$checked = ' checked="checked"';
	$update_on = ($config['fulltext_native_load_upd']) ? $checked : '';
	$update_off = (!$config['fulltext_native_load_upd']) ? $checked : '';

	$tpl = '
	<dl>
		<dt><label for="fulltext_native_load_upd">' . $user->lang['YES_SEARCH_UPDATE'] . ':</label><br /><span>' . $user->lang['YES_SEARCH_UPDATE_EXPLAIN'] . '</span></dt>
		<dd><label><input type="radio" id="fulltext_native_load_upd" name="config[fulltext_native_load_upd]" value="1"' . $update_on . ' class="radio" /> ' . $user->lang['YES'] . '</label><label><input type="radio" name="config[fulltext_native_load_upd]" value="0"' . $update_off . ' class="radio" /> ' . $user->lang['NO'] . '</label></dd>
	</dl>
	<dl>
		<dt><label for="fulltext_native_min_chars">' . $user->lang['MIN_SEARCH_CHARS'] . ':</label><br /><span>' . $user->lang['MIN_SEARCH_CHARS_EXPLAIN'] . '</span></dt>
		<dd><input id="fulltext_native_min_chars" type="text" size="3" maxlength="3" name="config[fulltext_native_min_chars]" value="' . (int) $config['fulltext_native_min_chars'] . '" /></dd>
	</dl>
	<dl>
		<dt><label for="fulltext_native_max_chars">' . $user->lang['MAX_SEARCH_CHARS'] . ':</label><br /><span>' . $user->lang['MAX_SEARCH_CHARS_EXPLAIN'] . '</span></dt>
		<dd><input id="fulltext_native_max_chars" type="text" size="3" maxlength="3" name="config[fulltext_native_max_chars]" value="' . (int) $config['fulltext_native_max_chars'] . '" /></dd>
	</dl>
	<dl>
		<dt><label for="fulltext_native_common_thres">' . $user->lang['COMMON_WORD_THRESHOLD'] . ':</label><br /><span>' . $user->lang['COMMON_WORD_THRESHOLD_EXPLAIN'] . '</span></dt>
		<dd><input id="fulltext_native_common_thres" type="text" size="3" maxlength="3" name="config[fulltext_native_common_thres]" value="' . (double) $config['fulltext_native_common_thres'] . '" /> %</dd>
	</dl>
	';

	// Config keys submitted by the form above together with their validation types
	$fields = array(
		'fulltext_native_load_upd'		=> 'bool',
		'fulltext_native_min_chars'		=> 'integer:0:255',
		'fulltext_native_max_chars'		=> 'integer:0:255',
		'fulltext_native_common_thres'	=> 'double:0:100',
	);

	return array(
		'tpl'		=> $tpl,
		'config'	=> $fields
	);
}
1688}
1689
1690?>
Note: See TracBrowser for help on using the repository browser.