source: trunk/forum/includes/search/fulltext_native.php

Last change on this file was 702, checked in by george, 15 years ago
  • Upraveno: Aktualizace fóra.
File size: 47.7 KB
Line 
1<?php
2/**
3*
4* @package search
5* @version $Id$
6* @copyright (c) 2005 phpBB Group
7* @license http://opensource.org/licenses/gpl-license.php GNU Public License
8*
9*/
10
11/**
12* @ignore
13*/
// Stop immediately unless the IN_PHPBB constant has been defined by the including script
if (defined('IN_PHPBB') === false)
{
	exit;
}
18
19/**
20* @ignore
21*/
22include_once($phpbb_root_path . 'includes/search/search.' . $phpEx);
23
24/**
25* fulltext_native
26* phpBB's own db driven fulltext search, version 2
27* @package search
28*/
29class fulltext_native extends search_backend
30{
31 var $stats = array();
32 var $word_length = array();
33 var $search_query;
34 var $common_words = array();
35
36 var $must_contain_ids = array();
37 var $must_not_contain_ids = array();
38 var $must_exclude_one_ids = array();
39
40 /**
41 * Initialises the fulltext_native search backend with min/max word length and makes sure the UTF-8 normalizer is loaded.
42 *
43 * @param boolean|string &$error is passed by reference and should either be set to false on success or an error message on failure.
44 *
45 * @access public
46 */
function fulltext_native(&$error)
{
	global $phpbb_root_path, $phpEx, $config;

	// Remember the configured minimum/maximum word lengths for later checks
	$this->word_length = array(
		'min'	=> $config['fulltext_native_min_chars'],
		'max'	=> $config['fulltext_native_max_chars'],
	);

	// Load the UTF tools, but only if the normalizer class is not available yet
	$normalizer_loaded = class_exists('utf_normalizer');
	if ($normalizer_loaded === false)
	{
		$normalizer_file = $phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx;
		include($normalizer_file);
	}

	// This constructor has no failure path of its own: always report success
	$error = false;
}
64
65 /**
66 * This function fills $this->search_query with the cleaned user search query.
67 *
68 * If $terms is 'any' then the words will be extracted from the search query
69 * and combined with | inside brackets. They will afterwards be treated like
70 * an standard search query.
71 *
72 * Then it analyses the query and fills the internal arrays $must_not_contain_ids,
73 * $must_contain_ids and $must_exclude_one_ids which are later used by keyword_search().
74 *
75 * @param string $keywords contains the search query string as entered by the user
76 * @param string $terms is either 'all' (use search query as entered, default words to 'must be contained in post')
77 * or 'any' (find all posts containing at least one of the given words)
78 * @return boolean false if no valid keywords were found and otherwise true
79 *
80 * @access public
81 */
function split_keywords($keywords, $terms)
{
	global $db, $user, $config;

	// Clean the raw input while keeping the operator characters interpreted below
	$keywords = trim($this->cleanup($keywords, '+-|()*'));

	// allow word|word|word without brackets
	if ((strpos($keywords, ' ') === false) && (strpos($keywords, '|') !== false) && (strpos($keywords, '(') === false))
	{
		$keywords = '(' . $keywords . ')';
	}

	// Normalise the query byte by byte:
	//  - inside a bracket every separator/operator becomes '|'
	//  - outside a bracket stray ')' and '|' become spaces and a space that
	//    follows a '+'/'-' is replaced by that operator (collapsed again below)
	// $open_bracket holds the offset of the currently open '(' or false
	$open_bracket = $space = false;
	for ($i = 0, $n = strlen($keywords); $i < $n; $i++)
	{
		if ($open_bracket !== false)
		{
			switch ($keywords[$i])
			{
				case ')':
					// "()" - an empty group - is turned into separators
					if ($open_bracket + 1 == $i)
					{
						$keywords[$i - 1] = '|';
						$keywords[$i] = '|';
					}
					$open_bracket = false;
				break;
				case '(':
					$keywords[$i] = '|';
				break;
				case '+':
				case '-':
				case ' ':
					$keywords[$i] = '|';
				break;
			}
		}
		else
		{
			switch ($keywords[$i])
			{
				case ')':
					$keywords[$i] = ' ';
				break;
				case '(':
					$open_bracket = $i;
					$space = false;
				break;
				case '|':
					$keywords[$i] = ' ';
				break;
				case '-':
				case '+':
					$space = $keywords[$i];
				break;
				case ' ':
					if ($space !== false)
					{
						$keywords[$i] = $space;
					}
				break;
				default:
					$space = false;
			}
		}
	}

	// Close a bracket the user left open. The comparison has to be strict:
	// a bracket opened at offset 0 is a valid (but falsy) position, and without
	// the closing ')' the group parsing below would cut off the last character
	// of the last word via substr($word, 1, -1).
	if ($open_bracket !== false)
	{
		$keywords .= ')';
	}

	// Collapse repeated spaces, pipes and operators and strip pipes next to brackets
	$match = array(
		'#  +#',
		'#\|\|+#',
		'#(\+|\-)(?:\+|\-)+#',
		'#\(\|#',
		'#\|\)#',
	);
	$replace = array(
		' ',
		'|',
		'$1',
		'(',
		')',
	);

	$keywords = preg_replace($match, $replace, $keywords);
	$num_keywords = sizeof(explode(' ', $keywords));

	// We limit the number of allowed keywords to minimize load on the database
	if ($config['max_num_search_keywords'] && $num_keywords > $config['max_num_search_keywords'])
	{
		trigger_error($user->lang('MAX_NUM_SEARCH_KEYWORDS_REFINE', $config['max_num_search_keywords'], $num_keywords));
	}

	// $keywords input format: each word separated by a space, words in a bracket are not separated

	// the user wants to search for any word, convert the search query
	if ($terms == 'any')
	{
		$words = array();

		preg_match_all('#([^\\s+\\-|()]+)(?:$|[\\s+\\-|()])#u', $keywords, $words);
		if (sizeof($words[1]))
		{
			$keywords = '(' . implode('|', $words[1]) . ')';
		}
	}

	// set the search_query which is shown to the user
	$this->search_query = $keywords;

	// Words without wildcard can be looked up directly in the word list
	$exact_words = array();
	preg_match_all('#([^\\s+\\-|*()]+)(?:$|[\\s+\\-|()])#u', $keywords, $exact_words);
	$exact_words = $exact_words[1];

	$common_ids = $words = array();

	if (sizeof($exact_words))
	{
		$sql = 'SELECT word_id, word_text, word_common
			FROM ' . SEARCH_WORDLIST_TABLE . '
			WHERE ' . $db->sql_in_set('word_text', $exact_words) . '
			ORDER BY word_count ASC';
		$result = $db->sql_query($sql);

		// store an array of words and ids, remove common words
		while ($row = $db->sql_fetchrow($result))
		{
			if ($row['word_common'])
			{
				$this->common_words[] = $row['word_text'];
				$common_ids[$row['word_text']] = (int) $row['word_id'];
				continue;
			}

			$words[$row['word_text']] = (int) $row['word_id'];
		}
		$db->sql_freeresult($result);
	}
	unset($exact_words);

	// now analyse the search query, first split it using the spaces
	$query = explode(' ', $keywords);

	$this->must_contain_ids = array();
	$this->must_not_contain_ids = array();
	$this->must_exclude_one_ids = array();

	$mode = '';
	$ignore_no_id = true;

	foreach ($query as $word)
	{
		if (empty($word))
		{
			continue;
		}

		// words which should not be included
		if ($word[0] == '-')
		{
			$word = substr($word, 1);

			// a group of which at least one may not be in the resulting posts
			// (isset() guards against a lone '-' which leaves nothing to inspect)
			if (isset($word[0]) && $word[0] == '(')
			{
				$word = array_unique(explode('|', substr($word, 1, -1)));
				$mode = 'must_exclude_one';
			}
			// one word which should not be in the resulting posts
			else
			{
				$mode = 'must_not_contain';
			}
			$ignore_no_id = true;
		}
		// words which have to be included
		else
		{
			// no prefix is the same as a +prefix
			if ($word[0] == '+')
			{
				$word = substr($word, 1);
			}

			// a group of words of which at least one word should be in every resulting post
			// (isset() guards against a lone '+' which leaves nothing to inspect)
			if (isset($word[0]) && $word[0] == '(')
			{
				$word = array_unique(explode('|', substr($word, 1, -1)));
			}
			$ignore_no_id = false;
			$mode = 'must_contain';
		}

		// Nothing left after stripping the prefix
		if (empty($word))
		{
			continue;
		}

		// if this is an array of words then retrieve an id for each
		if (is_array($word))
		{
			$non_common_words = array();
			$id_words = array();
			foreach ($word as $word_part)
			{
				if (strpos($word_part, '*') !== false)
				{
					// Wildcards are stored as quoted, escaped LIKE patterns (strings)
					$id_words[] = '\'' . $db->sql_escape(str_replace('*', '%', $word_part)) . '\'';
					$non_common_words[] = $word_part;
				}
				else if (isset($words[$word_part]))
				{
					// Known words are stored by their integer word id
					$id_words[] = $words[$word_part];
					$non_common_words[] = $word_part;
				}
				else
				{
					$len = utf8_strlen($word_part);
					if ($len < $this->word_length['min'] || $len > $this->word_length['max'])
					{
						$this->common_words[] = $word_part;
					}
				}
			}
			if (sizeof($id_words))
			{
				sort($id_words);
				if (sizeof($id_words) > 1)
				{
					$this->{$mode . '_ids'}[] = $id_words;
				}
				else
				{
					// A group with a single usable member behaves like a plain word
					$mode = ($mode == 'must_exclude_one') ? 'must_not_contain' : $mode;
					$this->{$mode . '_ids'}[] = $id_words[0];
				}
			}
			// throw an error if we shall not ignore unexistant words
			else if (!$ignore_no_id && sizeof($non_common_words))
			{
				trigger_error(sprintf($user->lang['WORDS_IN_NO_POST'], implode(', ', $non_common_words)));
			}
			unset($non_common_words);
		}
		// else we only need one id
		// ($wildcard receives the boolean result of the strpos() comparison)
		else if (($wildcard = strpos($word, '*') !== false) || isset($words[$word]))
		{
			if ($wildcard)
			{
				// The length limits apply to the word without its wildcard characters
				$len = utf8_strlen(str_replace('*', '', $word));
				if ($len >= $this->word_length['min'] && $len <= $this->word_length['max'])
				{
					$this->{$mode . '_ids'}[] = '\'' . $db->sql_escape(str_replace('*', '%', $word)) . '\'';
				}
				else
				{
					$this->common_words[] = $word;
				}
			}
			else
			{
				$this->{$mode . '_ids'}[] = $words[$word];
			}
		}
		// throw an error if we shall not ignore unexistant words
		else if (!$ignore_no_id)
		{
			if (!isset($common_ids[$word]))
			{
				$len = utf8_strlen($word);
				if ($len >= $this->word_length['min'] && $len <= $this->word_length['max'])
				{
					trigger_error(sprintf($user->lang['WORD_IN_NO_POST'], $word));
				}
				else
				{
					$this->common_words[] = $word;
				}
			}
		}
		else
		{
			$len = utf8_strlen($word);
			if ($len < $this->word_length['min'] || $len > $this->word_length['max'])
			{
				$this->common_words[] = $word;
			}
		}
	}

	// we can't search for negatives only
	if (!sizeof($this->must_contain_ids))
	{
		return false;
	}

	if (!empty($this->search_query))
	{
		return true;
	}
	return false;
}
387
388 /**
389 * Performs a search on keywords depending on display specific params. You have to run split_keywords() first.
390 *
391 * @param string $type contains either posts or topics depending on what should be searched for
392 * @param string $fields contains either titleonly (topic titles should be searched), msgonly (only message bodies should be searched), firstpost (only subject and body of the first post should be searched) or all (all post bodies and subjects should be searched)
393 * @param string $terms is either 'all' (use query as entered, words without prefix should default to "have to be in field") or 'any' (ignore search query parts and just return all posts that contain any of the specified words)
394 * @param array $sort_by_sql contains SQL code for the ORDER BY part of a query
395 * @param string $sort_key is the key of $sort_by_sql for the selected sorting
396 * @param string $sort_dir is either a or d representing ASC and DESC
397 * @param string $sort_days specifies the maximum amount of days a post may be old
398 * @param array $ex_fid_ary specifies an array of forum ids which should not be searched
399 * @param array $m_approve_fid_ary specifies an array of forum ids in which the searcher is allowed to view unapproved posts
400 * @param int $topic_id is set to 0 or a topic id, if it is not 0 then only posts in this topic should be searched
401 * @param array $author_ary an array of author ids if the author should be ignored during the search the array is empty
402 * @param string $author_name specifies the author match, when ANONYMOUS is also a search-match
403 * @param array &$id_ary passed by reference, to be filled with ids for the page specified by $start and $per_page, should be ordered
404 * @param int $start indicates the first index of the page
405 * @param int $per_page number of ids each page is supposed to contain
406 * @return boolean|int total number of results
407 *
408 * @access public
409 */
function keyword_search($type, $fields, $terms, $sort_by_sql, $sort_key, $sort_dir, $sort_days, $ex_fid_ary, $m_approve_fid_ary, $topic_id, $author_ary, $author_name, &$id_ary, $start, $per_page)
{
	global $config, $db;

	// No keywords? No posts.
	if (empty($this->search_query))
	{
		return false;
	}

	// Sorted copies of the id lists are only used for the cache key below
	$must_contain_ids = $this->must_contain_ids;
	$must_not_contain_ids = $this->must_not_contain_ids;
	$must_exclude_one_ids = $this->must_exclude_one_ids;

	sort($must_contain_ids);
	sort($must_not_contain_ids);
	sort($must_exclude_one_ids);

	// generate a search_key from all the options to identify the results
	$search_key = md5(implode('#', array(
		serialize($must_contain_ids),
		serialize($must_not_contain_ids),
		serialize($must_exclude_one_ids),
		$type,
		$fields,
		$terms,
		$sort_days,
		$sort_key,
		$topic_id,
		implode(',', $ex_fid_ary),
		implode(',', $m_approve_fid_ary),
		implode(',', $author_ary),
		$author_name,
	)));

	// try reading the results from cache
	$total_results = 0;
	if ($this->obtain_ids($search_key, $total_results, $id_ary, $start, $per_page, $sort_dir) == SEARCH_RESULT_IN_CACHE)
	{
		return $total_results;
	}

	$id_ary = array();

	$sql_where = array();
	// Counters used to generate unique aliases: m<N> for word match, w<N> for word list
	$m_num = 0;
	$w_num = 0;

	$sql_array = array(
		'SELECT'	=> ($type == 'posts') ? 'p.post_id' : 'p.topic_id',
		'FROM'		=> array(
			SEARCH_WORDMATCH_TABLE	=> array(),
			SEARCH_WORDLIST_TABLE	=> array(),
		),
		'LEFT_JOIN' => array(array(
			'FROM'	=> array(POSTS_TABLE => 'p'),
			'ON'	=> 'm0.post_id = p.post_id',
		)),
	);

	$title_match = '';
	$left_join_topics = false;
	$group_by = true;
	// Build some display specific sql strings
	switch ($fields)
	{
		case 'titleonly':
			$title_match = 'title_match = 1';
			$group_by = false;
		// no break
		case 'firstpost':
			$left_join_topics = true;
			$sql_where[] = 'p.post_id = t.topic_first_post_id';
		break;

		case 'msgonly':
			$title_match = 'title_match = 0';
			$group_by = false;
		break;
	}

	if ($type == 'topics')
	{
		$left_join_topics = true;
		$group_by = true;
	}

	/**
	* @todo Add a query optimizer (handle stuff like "+(4|3) +4")
	*/

	// Every required word (or group of alternatives) gets its own word match alias
	foreach ($this->must_contain_ids as $subquery)
	{
		if (is_array($subquery))
		{
			$group_by = true;

			$word_ids = array();
			foreach ($subquery as $id)
			{
				// String entries are quoted LIKE patterns, everything else is a word id
				if (is_string($id))
				{
					$sql_array['LEFT_JOIN'][] = array(
						'FROM'	=> array(SEARCH_WORDLIST_TABLE => 'w' . $w_num),
						'ON'	=> "w$w_num.word_text LIKE $id"
					);
					$word_ids[] = "w$w_num.word_id";

					$w_num++;
				}
				else
				{
					$word_ids[] = $id;
				}
			}

			$sql_where[] = $db->sql_in_set("m$m_num.word_id", $word_ids);

			unset($word_ids);
		}
		else if (is_string($subquery))
		{
			$sql_array['FROM'][SEARCH_WORDLIST_TABLE][] = 'w' . $w_num;

			$sql_where[] = "w$w_num.word_text LIKE $subquery";
			$sql_where[] = "m$m_num.word_id = w$w_num.word_id";

			$group_by = true;
			$w_num++;
		}
		else
		{
			$sql_where[] = "m$m_num.word_id = $subquery";
		}

		$sql_array['FROM'][SEARCH_WORDMATCH_TABLE][] = 'm' . $m_num;

		if ($title_match)
		{
			$sql_where[] = "m$m_num.$title_match";
		}

		// All further aliases have to refer to the same post as m0
		if ($m_num != 0)
		{
			$sql_where[] = "m$m_num.post_id = m0.post_id";
		}
		$m_num++;
	}

	// Wildcard exclusions are resolved through an extra word list join;
	// the stored pattern is replaced by the column that yields the word id
	foreach ($this->must_not_contain_ids as $key => $subquery)
	{
		if (is_string($subquery))
		{
			$sql_array['LEFT_JOIN'][] = array(
				'FROM'	=> array(SEARCH_WORDLIST_TABLE => 'w' . $w_num),
				'ON'	=> "w$w_num.word_text LIKE $subquery"
			);

			$this->must_not_contain_ids[$key] = "w$w_num.word_id";

			$group_by = true;
			$w_num++;
		}
	}

	// A post qualifies if the left join on any excluded word finds nothing
	if (sizeof($this->must_not_contain_ids))
	{
		$sql_array['LEFT_JOIN'][] = array(
			'FROM'	=> array(SEARCH_WORDMATCH_TABLE => 'm' . $m_num),
			'ON'	=> $db->sql_in_set("m$m_num.word_id", $this->must_not_contain_ids) . (($title_match) ? " AND m$m_num.$title_match" : '') . " AND m$m_num.post_id = m0.post_id"
		);

		$sql_where[] = "m$m_num.word_id IS NULL";
		$m_num++;
	}

	// For each exclusion group at least one of its words has to be missing
	foreach ($this->must_exclude_one_ids as $ids)
	{
		$is_null_joins = array();
		foreach ($ids as $id)
		{
			if (is_string($id))
			{
				$sql_array['LEFT_JOIN'][] = array(
					'FROM'	=> array(SEARCH_WORDLIST_TABLE => 'w' . $w_num),
					'ON'	=> "w$w_num.word_text LIKE $id"
				);
				$id = "w$w_num.word_id";

				$group_by = true;
				$w_num++;
			}

			$sql_array['LEFT_JOIN'][] = array(
				'FROM'	=> array(SEARCH_WORDMATCH_TABLE => 'm' . $m_num),
				'ON'	=> "m$m_num.word_id = $id AND m$m_num.post_id = m0.post_id" . (($title_match) ? " AND m$m_num.$title_match" : '')
			);
			$is_null_joins[] = "m$m_num.word_id IS NULL";

			$m_num++;
		}
		$sql_where[] = '(' . implode(' OR ', $is_null_joins) . ')';
	}

	// An approval list of exactly array(-1) adds no approval restriction at all
	if (!sizeof($m_approve_fid_ary))
	{
		$sql_where[] = 'p.post_approved = 1';
	}
	else if ($m_approve_fid_ary !== array(-1))
	{
		$sql_where[] = '(p.post_approved = 1 OR ' . $db->sql_in_set('p.forum_id', $m_approve_fid_ary, true) . ')';
	}

	if ($topic_id)
	{
		// Cast before concatenating into SQL, as author_search() does
		$sql_where[] = 'p.topic_id = ' . (int) $topic_id;
	}

	if (sizeof($author_ary))
	{
		if ($author_name)
		{
			// first one matches post of registered users, second one guests and deleted users
			$sql_author = '(' . $db->sql_in_set('p.poster_id', array_diff($author_ary, array(ANONYMOUS)), false, true) . ' OR p.post_username ' . $author_name . ')';
		}
		else
		{
			$sql_author = $db->sql_in_set('p.poster_id', $author_ary);
		}
		$sql_where[] = $sql_author;
	}

	if (sizeof($ex_fid_ary))
	{
		$sql_where[] = $db->sql_in_set('p.forum_id', $ex_fid_ary, true);
	}

	if ($sort_days)
	{
		$sql_where[] = 'p.post_time >= ' . (time() - ($sort_days * 86400));
	}

	$sql_array['WHERE'] = implode(' AND ', $sql_where);

	$is_mysql = false;
	// if the total result count is not cached yet, retrieve it from the db
	if (!$total_results)
	{
		$sql = '';
		$sql_array_count = $sql_array;

		if ($left_join_topics)
		{
			$sql_array_count['LEFT_JOIN'][] = array(
				'FROM'	=> array(TOPICS_TABLE => 't'),
				'ON'	=> 'p.topic_id = t.topic_id'
			);
		}

		switch ($db->sql_layer)
		{
			case 'mysql4':
			case 'mysqli':

				// 3.x does not support SQL_CALC_FOUND_ROWS
				// $sql_array['SELECT'] = 'SQL_CALC_FOUND_ROWS ' . $sql_array['SELECT'];
				// The count is fetched after the main query instead (see below)
				$is_mysql = true;

			break;

			case 'sqlite':
				// Counts the distinct ids through a sub select
				$sql_array_count['SELECT'] = ($type == 'posts') ? 'DISTINCT p.post_id' : 'DISTINCT p.topic_id';
				$sql = 'SELECT COUNT(' . (($type == 'posts') ? 'post_id' : 'topic_id') . ') as total_results
						FROM (' . $db->sql_build_query('SELECT', $sql_array_count) . ')';

			// no break

			default:
				$sql_array_count['SELECT'] = ($type == 'posts') ? 'COUNT(DISTINCT p.post_id) AS total_results' : 'COUNT(DISTINCT p.topic_id) AS total_results';
				// Keep the sqlite statement built above, otherwise build the generic one
				$sql = (!$sql) ? $db->sql_build_query('SELECT', $sql_array_count) : $sql;

				$result = $db->sql_query($sql);
				$total_results = (int) $db->sql_fetchfield('total_results');
				$db->sql_freeresult($result);

				if (!$total_results)
				{
					return false;
				}
			break;
		}

		unset($sql_array_count, $sql);
	}

	// Build sql strings for sorting
	$sql_sort = $sort_by_sql[$sort_key] . (($sort_dir == 'a') ? ' ASC' : ' DESC');

	// The first character of the sort expression is its table alias
	switch ($sql_sort[0])
	{
		case 'u':
			$sql_array['FROM'][USERS_TABLE] = 'u';
			$sql_where[] = 'u.user_id = p.poster_id ';
		break;

		case 't':
			$left_join_topics = true;
		break;

		case 'f':
			$sql_array['FROM'][FORUMS_TABLE] = 'f';
			$sql_where[] = 'f.forum_id = p.forum_id';
		break;
	}

	if ($left_join_topics)
	{
		$sql_array['LEFT_JOIN'][] = array(
			'FROM'	=> array(TOPICS_TABLE => 't'),
			'ON'	=> 'p.topic_id = t.topic_id'
		);
	}

	// Rebuild the WHERE clause, the sort handling may have added conditions
	$sql_array['WHERE'] = implode(' AND ', $sql_where);
	if ($group_by)
	{
		$sql_array['GROUP_BY'] = (($type == 'posts') ? 'p.post_id' : 'p.topic_id') . ', ' . $sort_by_sql[$sort_key];
	}
	else
	{
		$sql_array['GROUP_BY'] = '';
	}
	$sql_array['ORDER_BY'] = $sql_sort;

	unset($sql_where, $sql_sort, $group_by);

	// Only one block of ids is read from the database and cached afterwards
	$sql = $db->sql_build_query('SELECT', $sql_array);
	$result = $db->sql_query_limit($sql, $config['search_block_size'], $start);

	while ($row = $db->sql_fetchrow($result))
	{
		$id_ary[] = (int) $row[(($type == 'posts') ? 'post_id' : 'topic_id')];
	}
	$db->sql_freeresult($result);

	if (!sizeof($id_ary))
	{
		return false;
	}

	// if we use mysql and the total result count is not cached yet, retrieve it from the db
	if (!$total_results && $is_mysql)
	{
		// Count rows for the executed queries. Replace $select within $sql with SQL_CALC_FOUND_ROWS, and run it.
		$sql_array_copy = $sql_array;
		$sql_array_copy['SELECT'] = 'SQL_CALC_FOUND_ROWS p.post_id ';

		$sql = $db->sql_build_query('SELECT', $sql_array_copy);
		unset($sql_array_copy);

		// Capture the handle of this query so that it - and not the handle
		// already released above - is the one being freed
		$result = $db->sql_query($sql);
		$db->sql_freeresult($result);

		$sql = 'SELECT FOUND_ROWS() as total_results';
		$result = $db->sql_query($sql);
		$total_results = (int) $db->sql_fetchfield('total_results');
		$db->sql_freeresult($result);

		if (!$total_results)
		{
			return false;
		}
	}

	// store the ids, from start on then delete anything that isn't on the current page because we only need ids for one page
	$this->save_ids($search_key, $this->search_query, $author_ary, $total_results, $id_ary, $start, $sort_dir);
	$id_ary = array_slice($id_ary, 0, (int) $per_page);

	return $total_results;
}
786
787 /**
788 * Performs a search on an author's posts without caring about message contents. Depends on display specific params
789 *
790 * @param string $type contains either posts or topics depending on what should be searched for
791 * @param boolean $firstpost_only if true, only topic starting posts will be considered
792 * @param array $sort_by_sql contains SQL code for the ORDER BY part of a query
793 * @param string $sort_key is the key of $sort_by_sql for the selected sorting
794 * @param string $sort_dir is either a or d representing ASC and DESC
795 * @param string $sort_days specifies the maximum amount of days a post may be old
796 * @param array $ex_fid_ary specifies an array of forum ids which should not be searched
797 * @param array $m_approve_fid_ary specifies an array of forum ids in which the searcher is allowed to view unapproved posts
798 * @param int $topic_id is set to 0 or a topic id, if it is not 0 then only posts in this topic should be searched
799 * @param array $author_ary an array of author ids
800 * @param string $author_name specifies the author match, when ANONYMOUS is also a search-match
801 * @param array &$id_ary passed by reference, to be filled with ids for the page specified by $start and $per_page, should be ordered
802 * @param int $start indicates the first index of the page
803 * @param int $per_page number of ids each page is supposed to contain
804 * @return boolean|int total number of results
805 *
806 * @access public
807 */
function author_search($type, $firstpost_only, $sort_by_sql, $sort_key, $sort_dir, $sort_days, $ex_fid_ary, $m_approve_fid_ary, $topic_id, $author_ary, $author_name, &$id_ary, $start, $per_page)
{
	global $config, $db;

	// No author? No posts.
	if (!sizeof($author_ary))
	{
		return 0;
	}

	// generate a search_key from all the options to identify the results
	// (the empty strings fill the positions that carry no value for an author search)
	$search_key = md5(implode('#', array(
		'',
		$type,
		($firstpost_only) ? 'firstpost' : '',
		'',
		'',
		$sort_days,
		$sort_key,
		$topic_id,
		implode(',', $ex_fid_ary),
		implode(',', $m_approve_fid_ary),
		implode(',', $author_ary),
		$author_name,
	)));

	// try reading the results from cache
	$total_results = 0;
	if ($this->obtain_ids($search_key, $total_results, $id_ary, $start, $per_page, $sort_dir) == SEARCH_RESULT_IN_CACHE)
	{
		return $total_results;
	}

	$id_ary = array();

	// Create some display specific sql strings
	if ($author_name)
	{
		// first one matches post of registered users, second one guests and deleted users
		$sql_author = '(' . $db->sql_in_set('p.poster_id', array_diff($author_ary, array(ANONYMOUS)), false, true) . ' OR p.post_username ' . $author_name . ')';
	}
	else
	{
		$sql_author = $db->sql_in_set('p.poster_id', $author_ary);
	}
	$sql_fora		= (sizeof($ex_fid_ary)) ? ' AND ' . $db->sql_in_set('p.forum_id', $ex_fid_ary, true) : '';
	$sql_time		= ($sort_days) ? ' AND p.post_time >= ' . (time() - ($sort_days * 86400)) : '';
	$sql_topic_id	= ($topic_id) ? ' AND p.topic_id = ' . (int) $topic_id : '';
	$sql_firstpost	= ($firstpost_only) ? ' AND p.post_id = t.topic_first_post_id' : '';

	// Build sql strings for sorting
	$sql_sort = $sort_by_sql[$sort_key] . (($sort_dir == 'a') ? ' ASC' : ' DESC');
	$sql_sort_table = $sql_sort_join = '';
	// The first character of the sort expression is its table alias
	switch ($sql_sort[0])
	{
		case 'u':
			$sql_sort_table	= USERS_TABLE . ' u, ';
			$sql_sort_join	= ' AND u.user_id = p.poster_id ';
		break;

		case 't':
			// The topics table only has to be added if the query does not join it anyway
			$sql_sort_table	= ($type == 'posts' && !$firstpost_only) ? TOPICS_TABLE . ' t, ' : '';
			$sql_sort_join	= ($type == 'posts' && !$firstpost_only) ? ' AND t.topic_id = p.topic_id ' : '';
		break;

		case 'f':
			$sql_sort_table	= FORUMS_TABLE . ' f, ';
			$sql_sort_join	= ' AND f.forum_id = p.forum_id ';
		break;
	}

	// An approval list of exactly array(-1) adds no approval restriction at all
	if (!sizeof($m_approve_fid_ary))
	{
		$m_approve_fid_sql = ' AND p.post_approved = 1';
	}
	else if ($m_approve_fid_ary == array(-1))
	{
		$m_approve_fid_sql = '';
	}
	else
	{
		$m_approve_fid_sql = ' AND (p.post_approved = 1 OR ' . $db->sql_in_set('p.forum_id', $m_approve_fid_ary, true) . ')';
	}

	$select = ($type == 'posts') ? 'p.post_id' : 't.topic_id';
	$is_mysql = false;

	// If the cache was completely empty count the results
	if (!$total_results)
	{
		switch ($db->sql_layer)
		{
			case 'mysql4':
			case 'mysqli':
				// $select = 'SQL_CALC_FOUND_ROWS ' . $select;
				// The count is fetched after the main query instead (see below)
				$is_mysql = true;
			break;

			default:
				if ($type == 'posts')
				{
					$sql = 'SELECT COUNT(p.post_id) as total_results
						FROM ' . POSTS_TABLE . ' p' . (($firstpost_only) ? ', ' . TOPICS_TABLE . ' t ' : ' ') . "
						WHERE $sql_author
							$sql_topic_id
							$sql_firstpost
							$m_approve_fid_sql
							$sql_fora
							$sql_time";
				}
				else
				{
					// sqlite counts the distinct topic ids through a sub select
					if ($db->sql_layer == 'sqlite')
					{
						$sql = 'SELECT COUNT(topic_id) as total_results
							FROM (SELECT DISTINCT t.topic_id';
					}
					else
					{
						$sql = 'SELECT COUNT(DISTINCT t.topic_id) as total_results';
					}

					$sql .= ' FROM ' . TOPICS_TABLE . ' t, ' . POSTS_TABLE . " p
						WHERE $sql_author
							$sql_topic_id
							$sql_firstpost
							$m_approve_fid_sql
							$sql_fora
							AND t.topic_id = p.topic_id
							$sql_time" . (($db->sql_layer == 'sqlite') ? ')' : '');
				}
				$result = $db->sql_query($sql);

				$total_results = (int) $db->sql_fetchfield('total_results');
				$db->sql_freeresult($result);

				if (!$total_results)
				{
					return false;
				}
			break;
		}
	}

	// Build the query for really selecting the post_ids
	if ($type == 'posts')
	{
		$sql = "SELECT $select
			FROM " . $sql_sort_table . POSTS_TABLE . ' p' . (($firstpost_only) ? ', ' . TOPICS_TABLE . ' t' : '') . "
			WHERE $sql_author
				$sql_topic_id
				$sql_firstpost
				$m_approve_fid_sql
				$sql_fora
				$sql_sort_join
				$sql_time
			ORDER BY $sql_sort";
		$field = 'post_id';
	}
	else
	{
		$sql = "SELECT $select
			FROM " . $sql_sort_table . TOPICS_TABLE . ' t, ' . POSTS_TABLE . " p
			WHERE $sql_author
				$sql_topic_id
				$sql_firstpost
				$m_approve_fid_sql
				$sql_fora
				AND t.topic_id = p.topic_id
				$sql_sort_join
				$sql_time
			GROUP BY t.topic_id, " . $sort_by_sql[$sort_key] . '
			ORDER BY ' . $sql_sort;
		$field = 'topic_id';
	}

	// Only read one block of posts from the db and then cache it
	$result = $db->sql_query_limit($sql, $config['search_block_size'], $start);

	while ($row = $db->sql_fetchrow($result))
	{
		$id_ary[] = (int) $row[$field];
	}
	$db->sql_freeresult($result);

	if (!$total_results && $is_mysql)
	{
		// Count rows for the executed queries. Replace $select within $sql with SQL_CALC_FOUND_ROWS, and run it.
		$sql = str_replace('SELECT ' . $select, 'SELECT DISTINCT SQL_CALC_FOUND_ROWS p.post_id', $sql);

		// Capture the handle of this query so that it - and not the handle
		// already released above - is the one being freed
		$result = $db->sql_query($sql);
		$db->sql_freeresult($result);

		$sql = 'SELECT FOUND_ROWS() as total_results';
		$result = $db->sql_query($sql);
		$total_results = (int) $db->sql_fetchfield('total_results');
		$db->sql_freeresult($result);

		if (!$total_results)
		{
			return false;
		}
	}

	if (sizeof($id_ary))
	{
		// Cache the whole block, but hand back only the ids of the requested page
		$this->save_ids($search_key, '', $author_ary, $total_results, $id_ary, $start, $sort_dir);
		$id_ary = array_slice($id_ary, 0, $per_page);

		return $total_results;
	}
	return false;
}
1021
1022 /**
1023 * Split a text into words of a given length
1024 *
1025 * The text is converted to UTF-8, cleaned up, and split. Then, words that
1026 * conform to the defined length range are returned in an array.
1027 *
1028 * NOTE: duplicates are NOT removed from the return array
1029 *
1030 * @param string $text Text to split, encoded in UTF-8
1031 * @return array Array of UTF-8 words
1032 *
1033 * @access private
1034 */
function split_message($text)
{
	global $phpbb_root_path, $phpEx, $user;

	// Patterns of text that must not end up in the index
	$strip_patterns = array(
		// Do not index code
		'#\[code(?:=.*?)?(\:?[0-9a-z]{5,})\].*?\[\/code(\:?[0-9a-z]{5,})\]#is',
		// BBcode
		'#\[\/?[a-z0-9\*\+\-]+(?:=.*?)?(?::[a-z])?(\:?[0-9a-z]{5,})\]#',
	);

	$min_chars = $this->word_length['min'];
	// The maximum is read but not applied in this method; only the 255 byte limit below is
	$max_chars = $this->word_length['max'];

	// Largest byte length that is always rejected
	$byte_floor = $min_chars - 1;

	// Clean up the string, remove HTML tags, remove BBCodes
	$plain_text = strip_tags($text);
	$plain_text = preg_replace($strip_patterns, ' ', $plain_text);
	$cleaned = $this->cleanup($plain_text, -1);

	$found = array();

	// Walk over the space separated tokens until strtok() has nothing left
	for ($token = strtok($cleaned, ' '); strlen($token); $token = strtok(' '))
	{
		$bytes = strlen($token);

		// Tokens longer than 255 bytes are ignored. This will have to be changed
		// whenever the length of search_wordlist.word_text changes.
		// Tokens of at most $byte_floor bytes are ignored, too.
		if ($bytes > 255 || $bytes <= $byte_floor)
		{
			continue;
		}

		// Too few characters: the token is only kept if it starts inside the
		// Hangul or one of the CJK ranges, the minimum does not apply to those
		if (utf8_strlen($token) < $min_chars)
		{
			$in_hangul	= (strncmp($token, UTF8_HANGUL_FIRST, 3) >= 0 && strncmp($token, UTF8_HANGUL_LAST, 3) <= 0);
			$in_cjk		= (strncmp($token, UTF8_CJK_FIRST, 3) >= 0 && strncmp($token, UTF8_CJK_LAST, 3) <= 0);
			$in_cjk_b	= (strncmp($token, UTF8_CJK_B_FIRST, 4) >= 0 && strncmp($token, UTF8_CJK_B_LAST, 4) <= 0);

			if (!$in_hangul && !$in_cjk && !$in_cjk_b)
			{
				continue;
			}
		}

		// Duplicates are kept on purpose
		$found[] = $token;
	}

	return $found;
}
1101
/**
* Updates wordlist and wordmatch tables when a message is posted or changed
*
* The message and subject are split into words. For an edit, the words that are
* currently linked to the post are loaded and compared with the new words to
* find out which links must be added and which must be removed; for any other
* mode every word is simply added. Words missing from the word list are
* inserted, then the match rows and the per-word counters are adjusted between
* a transaction begin/commit pair. Finally the cached search results for all
* touched words and for the poster are destroyed.
*
* Nothing happens when $config['fulltext_native_load_upd'] is disabled.
*
* @param	string	$mode		Contains the post mode: edit, post, reply, quote
* @param	int		$post_id	The id of the post which is modified/created
* @param	string	&$message	New or updated post content
* @param	string	&$subject	New or updated post subject
* @param	int		$poster_id	Post author's user id
* @param	int		$forum_id	The id of the forum in which the post is located (not used by this method)
*
* @access	public
*/
function index($mode, $post_id, &$message, &$subject, $poster_id, $forum_id)
{
	global $config, $db;

	if (!$config['fulltext_native_load_upd'])
	{
		/**
		* The search indexer is disabled, return
		*/
		return;
	}

	// The id is embedded in several SQL statements below, so force it to an
	// integer once instead of trusting every call site
	$post_id = (int) $post_id;

	// Split old and new post/subject to obtain array of 'words'
	$split_text = $this->split_message($message);
	$split_title = $this->split_message($subject);

	// word_text => word_id of the words currently linked to the post (edit mode only)
	$cur_words = array('post' => array(), 'title' => array());

	$words = array();
	if ($mode == 'edit')
	{
		$words['add']['post'] = array();
		$words['add']['title'] = array();
		$words['del']['post'] = array();
		$words['del']['title'] = array();

		$sql = 'SELECT w.word_id, w.word_text, m.title_match
			FROM ' . SEARCH_WORDLIST_TABLE . ' w, ' . SEARCH_WORDMATCH_TABLE . " m
			WHERE m.post_id = $post_id
				AND w.word_id = m.word_id";
		$result = $db->sql_query($sql);

		while ($row = $db->sql_fetchrow($result))
		{
			$which = ($row['title_match']) ? 'title' : 'post';
			$cur_words[$which][$row['word_text']] = $row['word_id'];
		}
		$db->sql_freeresult($result);

		// New words that are not linked yet have to be added, linked words
		// that no longer occur have to be removed
		$words['add']['post'] = array_diff($split_text, array_keys($cur_words['post']));
		$words['add']['title'] = array_diff($split_title, array_keys($cur_words['title']));
		$words['del']['post'] = array_diff(array_keys($cur_words['post']), $split_text);
		$words['del']['title'] = array_diff(array_keys($cur_words['title']), $split_title);
	}
	else
	{
		$words['add']['post'] = $split_text;
		$words['add']['title'] = $split_title;
		$words['del']['post'] = array();
		$words['del']['title'] = array();
	}
	unset($split_text);
	unset($split_title);

	// Get unique words from the above arrays
	$unique_add_words = array_unique(array_merge($words['add']['post'], $words['add']['title']));

	// We now have unique arrays of all words to be added and removed and
	// individual arrays of added and removed words for text and title. What
	// we need to do now is add the new words (if they don't already exist)
	// and then add (or remove) matches between the words and this post
	if (sizeof($unique_add_words))
	{
		$sql = 'SELECT word_id, word_text
			FROM ' . SEARCH_WORDLIST_TABLE . '
			WHERE ' . $db->sql_in_set('word_text', $unique_add_words);
		$result = $db->sql_query($sql);

		$word_ids = array();
		while ($row = $db->sql_fetchrow($result))
		{
			$word_ids[$row['word_text']] = $row['word_id'];
		}
		$db->sql_freeresult($result);
		$new_words = array_diff($unique_add_words, array_keys($word_ids));

		$db->sql_transaction('begin');
		if (sizeof($new_words))
		{
			$sql_ary = array();

			foreach ($new_words as $word)
			{
				$sql_ary[] = array('word_text' => (string) $word, 'word_count' => 0);
			}

			// Errors of this insert are deliberately not fatal
			$db->sql_return_on_error(true);
			$db->sql_multi_insert(SEARCH_WORDLIST_TABLE, $sql_ary);
			$db->sql_return_on_error(false);
		}
		unset($new_words, $sql_ary);
	}
	else
	{
		$db->sql_transaction('begin');
	}

	// now update the search match table, remove links to removed words and add links to new words
	foreach ($words['del'] as $word_in => $word_ary)
	{
		$title_match = ($word_in == 'title') ? 1 : 0;

		if (sizeof($word_ary))
		{
			// Translate the removed word texts back into their word ids
			$sql_in = array();
			foreach ($word_ary as $word)
			{
				$sql_in[] = $cur_words[$word_in][$word];
			}

			$sql = 'DELETE FROM ' . SEARCH_WORDMATCH_TABLE . '
				WHERE ' . $db->sql_in_set('word_id', $sql_in) . '
					AND post_id = ' . $post_id . "
					AND title_match = $title_match";
			$db->sql_query($sql);

			$sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
				SET word_count = word_count - 1
				WHERE ' . $db->sql_in_set('word_id', $sql_in) . '
					AND word_count > 0';
			$db->sql_query($sql);

			unset($sql_in);
		}
	}

	// Errors while adding matches are deliberately not fatal either
	$db->sql_return_on_error(true);
	foreach ($words['add'] as $word_in => $word_ary)
	{
		$title_match = ($word_in == 'title') ? 1 : 0;

		if (sizeof($word_ary))
		{
			$sql = 'INSERT INTO ' . SEARCH_WORDMATCH_TABLE . ' (post_id, word_id, title_match)
				SELECT ' . $post_id . ', word_id, ' . (int) $title_match . '
				FROM ' . SEARCH_WORDLIST_TABLE . '
				WHERE ' . $db->sql_in_set('word_text', $word_ary);
			$db->sql_query($sql);

			$sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
				SET word_count = word_count + 1
				WHERE ' . $db->sql_in_set('word_text', $word_ary);
			$db->sql_query($sql);
		}
	}
	$db->sql_return_on_error(false);

	$db->sql_transaction('commit');

	// destroy cached search results containing any of the words removed or added
	$this->destroy_cache(array_unique(array_merge($words['add']['post'], $words['add']['title'], $words['del']['post'], $words['del']['title'])), array($poster_id));

	unset($unique_add_words);
	unset($words);
	unset($cur_words);
}
1269
/**
* Removes entries from the wordmatch table for the specified post_ids
*
* Before the match rows are deleted, the counters of all words linked to the
* posts are decreased (never below zero). Afterwards the cached search results
* for those words and for the given authors are destroyed.
*
* @param	array	$post_ids		Ids of the posts to remove from the index
* @param	array	$author_ids		Ids of the authors, passed on to destroy_cache()
* @param	array	$forum_ids		Not used by this method
*/
function index_remove($post_ids, $author_ids, $forum_ids)
{
	global $db;

	// Must exist even when no post ids were passed, it is handed to
	// destroy_cache() unconditionally at the end
	$word_texts = array();

	if (sizeof($post_ids))
	{
		$sql = 'SELECT w.word_id, w.word_text, m.title_match
			FROM ' . SEARCH_WORDMATCH_TABLE . ' m, ' . SEARCH_WORDLIST_TABLE . ' w
			WHERE ' . $db->sql_in_set('m.post_id', $post_ids) . '
				AND w.word_id = m.word_id';
		$result = $db->sql_query($sql);

		$message_word_ids = $title_word_ids = array();
		while ($row = $db->sql_fetchrow($result))
		{
			if ($row['title_match'])
			{
				$title_word_ids[] = $row['word_id'];
			}
			else
			{
				$message_word_ids[] = $row['word_id'];
			}
			$word_texts[] = $row['word_text'];
		}
		$db->sql_freeresult($result);

		// NOTE(review): each UPDATE lowers a word's counter by one no matter how
		// often its id occurs in the list - confirm this is intended when several
		// of the removed posts share a word
		if (sizeof($title_word_ids))
		{
			$sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
				SET word_count = word_count - 1
				WHERE ' . $db->sql_in_set('word_id', $title_word_ids) . '
					AND word_count > 0';
			$db->sql_query($sql);
		}

		if (sizeof($message_word_ids))
		{
			$sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
				SET word_count = word_count - 1
				WHERE ' . $db->sql_in_set('word_id', $message_word_ids) . '
					AND word_count > 0';
			$db->sql_query($sql);
		}

		unset($title_word_ids);
		unset($message_word_ids);

		$sql = 'DELETE FROM ' . SEARCH_WORDMATCH_TABLE . '
			WHERE ' . $db->sql_in_set('post_id', $post_ids);
		$db->sql_query($sql);
	}

	$this->destroy_cache(array_unique($word_texts), $author_ids);
}
1328
/**
* Tidy up indexes: tag 'common words' and drop their match rows
*
* A word counts as common when it is already flagged as such or when its
* word_count exceeds the configured percentage of the total number of posts.
* The check only runs on boards with at least 100 posts and a non-zero
* threshold. The time of the run is stored in the search_last_gc config value.
*/
function tidy()
{
	global $db, $config;

	// Nothing to tidy while the indexer is switched off, only remember the run
	if (!$config['fulltext_native_load_upd'])
	{
		set_config('search_last_gc', time(), true);
		return;
	}

	$affected_words = array();

	if ($config['num_posts'] >= 100 && $config['fulltext_native_common_thres'])
	{
		// The threshold is configured as a percentage of all posts
		$ratio = ((double) $config['fulltext_native_common_thres']) / 100.0;
		$max_count = floor($config['num_posts'] * $ratio);

		$sql = 'SELECT word_id, word_text
			FROM ' . SEARCH_WORDLIST_TABLE . '
			WHERE word_count > ' . $max_count . '
				OR word_common = 1';
		$result = $db->sql_query($sql);

		$common_ids = array();
		while ($row = $db->sql_fetchrow($result))
		{
			$common_ids[] = $row['word_id'];
			$affected_words[] = $row['word_text'];
		}
		$db->sql_freeresult($result);

		if (sizeof($common_ids))
		{
			$in_common = $db->sql_in_set('word_id', $common_ids);

			// Mark the words as common
			$db->sql_query('UPDATE ' . SEARCH_WORDLIST_TABLE . '
				SET word_common = 1
				WHERE ' . $in_common);

			// Store the new time before the delete runs, so that a user who reloads
			// because the delete takes too long does not start it a second time
			set_config('search_last_gc', time(), true);

			// Common words are not searchable, their matches can go
			$db->sql_query('DELETE FROM ' . SEARCH_WORDMATCH_TABLE . '
				WHERE ' . $in_common);
		}
		unset($common_ids);
	}

	// Cached results containing one of the common words are outdated now
	if (sizeof($affected_words))
	{
		$this->destroy_cache(array_unique($affected_words));
	}

	set_config('search_last_gc', time(), true);
}
1394
/**
* Deletes all words from the index
*
* Empties the word list, the word match and the cached search results tables.
* The sqlite and firebird layers get a plain DELETE, every other layer a
* TRUNCATE TABLE.
*
* @param	mixed	$acp_module		Not used by this method
* @param	mixed	$u_action		Not used by this method
*/
function delete_index($acp_module, $u_action)
{
	global $db;

	// Loose comparison on purpose, it mirrors a switch statement
	$use_delete = in_array($db->sql_layer, array('sqlite', 'firebird'));
	$statement = ($use_delete) ? 'DELETE FROM ' : 'TRUNCATE TABLE ';

	$tables = array(SEARCH_WORDLIST_TABLE, SEARCH_WORDMATCH_TABLE, SEARCH_RESULTS_TABLE);

	foreach ($tables as $table)
	{
		$db->sql_query($statement . $table);
	}
}
1418
/**
* Tells whether the index holds any data
*
* The statistics are loaded on first use.
*
* @return	bool	true if both the word list and the word match table contain rows
*/
function index_created()
{
	if (count($this->stats) == 0)
	{
		$this->get_stats();
	}

	return ($this->stats['total_words'] && $this->stats['total_matches']);
}
1431
/**
* Returns an associative array containing information about the indexes
*
* The statistics are loaded on first use.
*
* @return	array	Language string => value pairs for the total number of words and matches
*/
function index_stats()
{
	global $user;

	if (count($this->stats) == 0)
	{
		$this->get_stats();
	}

	$report = array();
	$report[$user->lang['TOTAL_WORDS']] = $this->stats['total_words'];
	$report[$user->lang['TOTAL_MATCHES']] = $this->stats['total_matches'];

	return $report;
}
1448
/**
* Counts the rows of the word list and word match tables and stores the
* results in $this->stats['total_words'] and $this->stats['total_matches']
*/
function get_stats()
{
	global $db;

	// stats key (also used as column alias) => table to count
	$counters = array(
		'total_words'	=> SEARCH_WORDLIST_TABLE,
		'total_matches'	=> SEARCH_WORDMATCH_TABLE,
	);

	foreach ($counters as $alias => $table)
	{
		$sql = 'SELECT COUNT(*) as ' . $alias . '
			FROM ' . $table;
		$result = $db->sql_query($sql);
		$this->stats[$alias] = (int) $db->sql_fetchfield($alias);
		$db->sql_freeresult($result);
	}
}
1465
/**
* Clean up a text to remove non-alphanumeric characters
*
* This method receives a UTF-8 string, normalizes and validates it, replaces all
* non-alphanumeric characters with spaces then returns the result.
*
* Any number of "allowed chars" can be passed as a UTF-8 string in NFC.
*
* Hangul and CJK characters are kept and surrounded by spaces so that each of
* them is indexed on its own. Every other multibyte character is looked up in
* the search indexer conversion tables, which are loaded block by block on
* demand and kept in a static cache for later calls; characters without a
* mapping become a space.
*
* @param	string	$text			Text to split, in UTF-8 (not normalized or sanitized)
* @param	string	$allowed_chars	String of special chars to allow
* @param	string	$encoding		Text encoding
* @return	string					Cleaned up text, only alphanumeric chars are left
*
* @todo normalizer::cleanup being able to be used?
*/
function cleanup($text, $allowed_chars = null, $encoding = 'utf-8')
{
	global $phpbb_root_path, $phpEx;
	static $conv = array(), $conv_loaded = array();
	$allow = array();

	// Convert the text to UTF-8
	$encoding = strtolower($encoding);
	if ($encoding != 'utf-8')
	{
		$text = utf8_recode($text, $encoding);
	}

	// Sequence length, indexed by the upper four bits of the lead byte
	$utf_len_mask = array(
		"\xC0"	=> 2,
		"\xD0"	=> 2,
		"\xE0"	=> 3,
		"\xF0"	=> 4
	);

	/**
	* Replace HTML entities and NCRs
	*/
	$text = htmlspecialchars_decode(utf8_decode_ncr($text), ENT_QUOTES);

	/**
	* Normalize the text. The return value is not used, so the normalizer
	* presumably works on $text by reference.
	*/
	utf_normalizer::nfc($text);

	/**
	* The first thing we do is:
	*
	* - convert ASCII-7 letters to lowercase
	* - remove the ASCII-7 non-alpha characters
	* - remove the bytes that should not appear in a valid UTF-8 string: 0xC0,
	*   0xC1 and 0xF5-0xFF
	*
	* @todo in theory, the third one is already taken care of during normalization and those chars should have been replaced by Unicode replacement chars
	*/
	$sb_match = "ISTCPAMELRDOJBNHFGVWUQKYXZ\r\n\t!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\xC0\xC1\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF";

	// strtr() ignores the surplus characters of the longer argument, so the
	// replacement has to be exactly as long as $sb_match: the 26 lowercase
	// letters followed by one space for every remaining character
	$sb_replace = 'istcpamelrdojbnhfgvwuqkyxz' . str_repeat(' ', strlen($sb_match) - 26);

	/**
	* This is the list of legal ASCII chars, it is automatically extended
	* with ASCII chars from $allowed_chars
	*/
	$legal_ascii = ' eaisntroludcpmghbfvq10xy2j9kw354867z';

	/**
	* Prepare an array containing the extra chars to allow
	*/
	if (isset($allowed_chars[0]))
	{
		$pos = 0;
		$len = strlen($allowed_chars);
		do
		{
			$c = $allowed_chars[$pos];

			if ($c < "\x80")
			{
				/**
				* ASCII char
				*/
				$sb_pos = strpos($sb_match, $c);
				if (is_int($sb_pos))
				{
					/**
					* Remove the char from $sb_match and its corresponding
					* replacement in $sb_replace
					*/
					$sb_match = substr($sb_match, 0, $sb_pos) . substr($sb_match, $sb_pos + 1);
					$sb_replace = substr($sb_replace, 0, $sb_pos) . substr($sb_replace, $sb_pos + 1);
					$legal_ascii .= $c;
				}

				++$pos;
			}
			else
			{
				/**
				* UTF-8 char
				*/
				$lead = $c & "\xF0";

				if (!isset($utf_len_mask[$lead]))
				{
					// Not a lead byte: skip it, otherwise $pos would never advance
					++$pos;
					continue;
				}

				$utf_len = $utf_len_mask[$lead];
				$allow[substr($allowed_chars, $pos, $utf_len)] = 1;
				$pos += $utf_len;
			}
		}
		while ($pos < $len);
	}

	$text = strtr($text, $sb_match, $sb_replace);
	$ret = '';

	$pos = 0;
	$len = strlen($text);

	do
	{
		/**
		* Do all consecutive ASCII chars at once
		*/
		if ($spn = strspn($text, $legal_ascii, $pos))
		{
			$ret .= substr($text, $pos, $spn);
			$pos += $spn;
		}

		if ($pos >= $len)
		{
			return $ret;
		}

		/**
		* Capture the UTF char
		*/
		$lead = $text[$pos] & "\xF0";

		if (!isset($utf_len_mask[$lead]))
		{
			// Neither a legal ASCII char nor a UTF-8 lead byte (e.g. 0x7F or a stray
			// continuation byte): blank it and move on, otherwise $pos would never
			// advance and the loop would not terminate
			$ret .= ' ';
			++$pos;
			continue;
		}

		$utf_len = $utf_len_mask[$lead];
		$utf_char = substr($text, $pos, $utf_len);
		$pos += $utf_len;

		if (($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST)
			|| ($utf_char >= UTF8_CJK_FIRST && $utf_char <= UTF8_CJK_LAST)
			|| ($utf_char >= UTF8_CJK_B_FIRST && $utf_char <= UTF8_CJK_B_LAST))
		{
			/**
			* All characters within these ranges are valid
			*
			* We separate them with a space in order to index each character
			* individually
			*/
			$ret .= ' ' . $utf_char . ' ';
			continue;
		}

		if (isset($allow[$utf_char]))
		{
			/**
			* The char is explicitly allowed
			*/
			$ret .= $utf_char;
			continue;
		}

		if (isset($conv[$utf_char]))
		{
			/**
			* The char is mapped to something, maybe to itself actually
			*/
			$ret .= $conv[$utf_char];
			continue;
		}

		/**
		* The char isn't mapped, but did we load its conversion table?
		*
		* The search indexer table is split into blocks. The block number of
		* each char is equal to its codepoint right-shifted for 11 bits. It
		* means that out of the 11, 16 or 21 meaningful bits of a 2-, 3- or
		* 4- byte sequence we only keep the leftmost 0, 5 or 10 bits. Thus,
		* all UTF chars encoded in 2 bytes are in the same first block.
		*/
		if (isset($utf_char[2]))
		{
			if (isset($utf_char[3]))
			{
				/**
				* 1111 0nnn 10nn nnnn 10nx xxxx 10xx xxxx
				* 0000 0111 0011 1111 0010 0000
				*/
				$idx = ((ord($utf_char[0]) & 0x07) << 7) | ((ord($utf_char[1]) & 0x3F) << 1) | ((ord($utf_char[2]) & 0x20) >> 5);
			}
			else
			{
				/**
				* 1110 nnnn 10nx xxxx 10xx xxxx
				* 0000 1111 0010 0000
				*
				* All four payload bits of the lead byte are needed to get the
				* leftmost 5 of the 16 bits; masking with 0x07 would drop the top
				* bit and fold U+8000..U+FFFF onto the blocks of U+0000..U+7FFF
				*/
				$idx = ((ord($utf_char[0]) & 0x0F) << 1) | ((ord($utf_char[1]) & 0x20) >> 5);
			}
		}
		else
		{
			/**
			* 110x xxxx 10xx xxxx
			* 0000 0000 0000 0000
			*/
			$idx = 0;
		}

		/**
		* Check if the required conv table has been loaded already
		*/
		if (!isset($conv_loaded[$idx]))
		{
			$conv_loaded[$idx] = 1;
			$file = $phpbb_root_path . 'includes/utf/data/search_indexer_' . $idx . '.' . $phpEx;

			if (file_exists($file))
			{
				$conv += include($file);
			}
		}

		if (isset($conv[$utf_char]))
		{
			$ret .= $conv[$utf_char];
		}
		else
		{
			/**
			* We add an entry to the conversion table so that we
			* don't have to convert to codepoint and perform the checks
			* that are above this block
			*/
			$conv[$utf_char] = ' ';
			$ret .= ' ';
		}
	}
	while (1);

	return $ret;
}
1707
/**
* Returns a list of options for the ACP to display
*
* @return	array	'tpl' holds the HTML of the settings form (indexer on/off, minimum
*					and maximum word length, common word threshold), 'config' maps
*					each config key of the form to its validation type
*/
function acp()
{
	global $user, $config;

	$lang = $user->lang;

	// Current values, prepared for output
	$checked = ' checked="checked"';
	$upd_yes = ($config['fulltext_native_load_upd']) ? $checked : '';
	$upd_no = (!$config['fulltext_native_load_upd']) ? $checked : '';
	$min_chars = (int) $config['fulltext_native_min_chars'];
	$max_chars = (int) $config['fulltext_native_max_chars'];
	$common_thres = (double) $config['fulltext_native_common_thres'];

	$tpl = '
		<dl>
			<dt><label for="fulltext_native_load_upd">' . $lang['YES_SEARCH_UPDATE'] . ':</label><br /><span>' . $lang['YES_SEARCH_UPDATE_EXPLAIN'] . '</span></dt>
			<dd><label><input type="radio" id="fulltext_native_load_upd" name="config[fulltext_native_load_upd]" value="1"' . $upd_yes . ' class="radio" /> ' . $lang['YES'] . '</label><label><input type="radio" name="config[fulltext_native_load_upd]" value="0"' . $upd_no . ' class="radio" /> ' . $lang['NO'] . '</label></dd>
		</dl>
		<dl>
			<dt><label for="fulltext_native_min_chars">' . $lang['MIN_SEARCH_CHARS'] . ':</label><br /><span>' . $lang['MIN_SEARCH_CHARS_EXPLAIN'] . '</span></dt>
			<dd><input id="fulltext_native_min_chars" type="text" size="3" maxlength="3" name="config[fulltext_native_min_chars]" value="' . $min_chars . '" /></dd>
		</dl>
		<dl>
			<dt><label for="fulltext_native_max_chars">' . $lang['MAX_SEARCH_CHARS'] . ':</label><br /><span>' . $lang['MAX_SEARCH_CHARS_EXPLAIN'] . '</span></dt>
			<dd><input id="fulltext_native_max_chars" type="text" size="3" maxlength="3" name="config[fulltext_native_max_chars]" value="' . $max_chars . '" /></dd>
		</dl>
		<dl>
			<dt><label for="fulltext_native_common_thres">' . $lang['COMMON_WORD_THRESHOLD'] . ':</label><br /><span>' . $lang['COMMON_WORD_THRESHOLD_EXPLAIN'] . '</span></dt>
			<dd><input id="fulltext_native_common_thres" type="text" size="3" maxlength="3" name="config[fulltext_native_common_thres]" value="' . $common_thres . '" /> %</dd>
		</dl>
		';

	// These are fields required in the config table
	$fields = array(
		'fulltext_native_load_upd'		=> 'bool',
		'fulltext_native_min_chars'		=> 'integer:0:255',
		'fulltext_native_max_chars'		=> 'integer:0:255',
		'fulltext_native_common_thres'	=> 'double:0:100',
	);

	return array(
		'tpl'		=> $tpl,
		'config'	=> $fields
	);
}
1745}
1746
1747?>
Note: See TracBrowser for help on using the repository browser.