source: trunk/forum/includes/utf/utf_normalizer.php

Last change on this file was 400, checked in by george, 16 years ago
  • Přidáno: Nové forum phpBB 3.
File size: 41.8 KB
Line 
1<?php
2/**
3*
4* @package utf
5* @version $Id: utf_normalizer.php 8479 2008-03-29 00:22:48Z naderman $
6* @copyright (c) 2005 phpBB Group
7* @license http://opensource.org/licenses/gpl-license.php GNU Public License
8*
9*/
10
11/**
12*/
// Abort immediately unless the IN_PHPBB constant has been defined
// (presumably by the entry script that includes this file - TODO confirm).
if (!defined('IN_PHPBB'))
{
    exit;
}
17
/**
* Some Unicode characters encoded in UTF-8
*
* Preserved for compatibility
*/
define('UTF8_REPLACEMENT', "\xEF\xBF\xBD");         // U+FFFD
define('UTF8_MAX', "\xF4\x8F\xBF\xBF");             // U+10FFFF
define('UTF8_FFFE', "\xEF\xBF\xBE");                // U+FFFE
define('UTF8_FFFF', "\xEF\xBF\xBF");                // U+FFFF
define('UTF8_SURROGATE_FIRST', "\xED\xA0\x80");     // U+D800
define('UTF8_SURROGATE_LAST', "\xED\xBF\xBF");      // U+DFFF
define('UTF8_HANGUL_FIRST', "\xEA\xB0\x80");        // U+AC00
define('UTF8_HANGUL_LAST', "\xED\x9E\xA3");         // U+D7A3

define('UTF8_CJK_FIRST', "\xE4\xB8\x80");           // U+4E00
define('UTF8_CJK_LAST', "\xE9\xBE\xBB");            // U+9FBB
define('UTF8_CJK_B_FIRST', "\xF0\xA0\x80\x80");     // U+20000
define('UTF8_CJK_B_LAST', "\xF0\xAA\x9B\x96");      // U+2A6D6

// Unset global variables
// The normalizer tests these globals with isset() before including its data tables,
// so clearing them here makes sure the tables come from those includes only.
unset($GLOBALS['utf_jamo_index'], $GLOBALS['utf_jamo_type'], $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_combining_class'], $GLOBALS['utf_canonical_comp'], $GLOBALS['utf_canonical_decomp'], $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);

// NFC_QC and NFKC_QC values
define('UNICODE_QC_MAYBE', 0);
define('UNICODE_QC_NO', 1);

// Contains all the ASCII characters appearing in UTF-8, sorted by frequency
// (all 128 bytes from 0x00 to 0x7F, each listed exactly once)
define('UTF8_ASCII_RANGE', "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F");

// Contains all the tail bytes that can appear in the composition of a UTF-8 char
// (the 64 bytes from 0x80 to 0xBF, each listed exactly once)
define('UTF8_TRAILING_BYTES', "\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A");

// Constants used by the Hangul [de]composition algorithms
// Note: NCOUNT = VCOUNT * TCOUNT and SCOUNT = LCOUNT * NCOUNT
define('UNICODE_HANGUL_SBASE', 0xAC00);
define('UNICODE_HANGUL_LBASE', 0x1100);
define('UNICODE_HANGUL_VBASE', 0x1161);
define('UNICODE_HANGUL_TBASE', 0x11A7);
define('UNICODE_HANGUL_SCOUNT', 11172);
define('UNICODE_HANGUL_LCOUNT', 19);
define('UNICODE_HANGUL_VCOUNT', 21);
define('UNICODE_HANGUL_TCOUNT', 28);
define('UNICODE_HANGUL_NCOUNT', 588);
define('UNICODE_JAMO_L', 0);
define('UNICODE_JAMO_V', 1);
define('UNICODE_JAMO_T', 2);
63
64/**
65* Unicode normalization routines
66*
67* @package utf
68*/
69class utf_normalizer
70{
/**
* Validate, cleanup and normalize a string
*
* The ultimate convenience function! Clean up invalid UTF-8 sequences,
* and convert to Normal Form C, canonical composition.
*
* The string is modified in place through the reference; nothing is returned.
*
* @param string &$str The dirty string, replaced by its shiny and cleaned-up version
* @return void
*/
function cleanup(&$str)
{
    // The string below is the list of all authorized ASCII characters, sorted by frequency in latin text.
    // It holds the printable characters, DEL, \t, \n and \r; the other control characters are left out
    // on purpose so that a string containing one of them goes through the replacement step below.
    $pos = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");
    $len = strlen($str);

    if ($pos == $len)
    {
        // ASCII strings with no special chars return immediately
        return;
    }

    global $phpbb_root_path, $phpEx;

    // Load the NFC quick-check table and the canonical decomposition table if they are not there yet
    if (!isset($GLOBALS['utf_nfc_qc']))
    {
        include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
    }

    if (!isset($GLOBALS['utf_canonical_decomp']))
    {
        include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
    }

    // Replace any byte in the range 0x00..0x1F, except for \r, \n and \t
    // We replace those characters with a 0xFF byte, which is illegal in UTF-8 and will in turn be replaced with a UTF replacement char
    $control_chars = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F";

    // strtr() silently ignores the extra bytes of the longer argument, so the replacement
    // list is built from the length of the source list instead of being counted by hand
    $str = strtr($str, $control_chars, str_repeat("\xFF", strlen($control_chars)));

    $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
}
115
/**
* Validate and normalize a UTF string to NFC
*
* The string is modified in place through the reference; nothing is returned.
*
* @param string &$str Unchecked UTF string, replaced by its validated, normal form version
* @return void
*/
function nfc(&$str)
{
    $pos = strspn($str, UTF8_ASCII_RANGE);
    $len = strlen($str);

    if ($pos == $len)
    {
        // ASCII strings return immediately
        return;
    }

    global $phpbb_root_path, $phpEx;

    // Load the NFC quick-check table and the canonical decomposition table if they are not there yet
    if (!isset($GLOBALS['utf_nfc_qc']))
    {
        include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
    }

    if (!isset($GLOBALS['utf_canonical_decomp']))
    {
        include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
    }

    $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
}
147
/**
* Validate and normalize a UTF string to NFKC
*
* The string is modified in place through the reference; nothing is returned.
*
* @param string &$str Unchecked UTF string, replaced by its validated, normal form version
* @return void
*/
function nfkc(&$str)
{
    $pos = strspn($str, UTF8_ASCII_RANGE);
    $len = strlen($str);

    if ($pos == $len)
    {
        // ASCII strings return immediately
        return;
    }

    global $phpbb_root_path, $phpEx;

    // Load the NFKC quick-check table and the compatibility decomposition table if they are not there yet
    if (!isset($GLOBALS['utf_nfkc_qc']))
    {
        include($phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx);
    }

    if (!isset($GLOBALS['utf_compatibility_decomp']))
    {
        include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
    }

    $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
}
179
/**
* Validate and normalize a UTF string to NFD
*
* The string is modified in place through the reference; nothing is returned.
*
* @param string &$str Unchecked UTF string, replaced by its validated, normal form version
* @return void
*/
function nfd(&$str)
{
    $pos = strspn($str, UTF8_ASCII_RANGE);
    $len = strlen($str);

    if ($pos == $len)
    {
        // ASCII strings return immediately
        return;
    }

    // Load the canonical decomposition table if it is not there yet
    if (!isset($GLOBALS['utf_canonical_decomp']))
    {
        global $phpbb_root_path, $phpEx;
        include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
    }

    $str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);
}
205
/**
* Validate and normalize a UTF string to NFKD
*
* The string is modified in place through the reference; nothing is returned.
*
* @param string &$str Unchecked UTF string, replaced by its validated, normal form version
* @return void
*/
function nfkd(&$str)
{
    $pos = strspn($str, UTF8_ASCII_RANGE);
    $len = strlen($str);

    if ($pos == $len)
    {
        // ASCII strings return immediately
        return;
    }

    // Load the compatibility decomposition table if it is not there yet
    if (!isset($GLOBALS['utf_compatibility_decomp']))
    {
        global $phpbb_root_path, $phpEx;
        include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
    }

    $str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);
}
231
232
/**
* Recompose a UTF string
*
* Walks the string from $pos onwards. Ill-formed, overlong, surrogate, U+FFFE/U+FFFF and
* out-of-range sequences as well as stray trailing bytes are replaced with UTF8_REPLACEMENT.
* Every run of characters that is on the quick-check list, or that holds combining characters
* out of canonical order, is decomposed, sorted by combining class and composed again.
*
* @param string $str Unchecked UTF string
* @param integer $pos Position of the first UTF char (in bytes); must be lower than $len because the byte at $pos is read before any bound check
* @param integer $len Length of the string (in bytes)
* @param array &$qc Quick-check array, passed by reference but never modified
* @param array &$decomp_map Decomposition mapping, passed by reference but never modified
* @return string The string, validated and recomposed
*
* @access private
*/
function recompose($str, $pos, $len, &$qc, &$decomp_map)
{
    global $utf_combining_class, $utf_canonical_comp, $utf_jamo_type, $utf_jamo_index;

    // Load some commonly-used tables
    if (!isset($utf_jamo_index, $utf_jamo_type, $utf_combining_class))
    {
        global $phpbb_root_path, $phpEx;
        include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
    }

    // Load the canonical composition table
    if (!isset($utf_canonical_comp))
    {
        global $phpbb_root_path, $phpEx;
        include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);
    }

    // $tmp receives the fixed output; $tmp_pos is the offset in $str up to which $tmp is complete
    $tmp = '';
    $i = $tmp_pos = $last_cc = 0;

    // Buffer the last ASCII char before the UTF-8 stuff if applicable
    $buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array();

    // UTF char length array
    // This array is used to determine the length of a UTF character.
    // Be $c the result of ($str[$pos] & "\xF0") --where $str is the string we're operating on and $pos
    // the position of the cursor--, if $utf_len_mask[$c] does not exist, the byte is an ASCII char.
    // Otherwise, if $utf_len_mask[$c] is greater than 0, we have the leading byte of a multibyte character
    // whose length is $utf_len_mask[$c] and if it is equal to 0, the byte is a trailing byte.
    $utf_len_mask = array(
        // Leading bytes masks
        "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
        // Trailing bytes masks
        "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
    );

    // Leading bytes that trigger the extra checks performed in the switch below
    $extra_check = array(
        "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
        "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
        "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
    );

    // A sequence of length n is well-formed if ($utf_char & $utf_validation_mask[n]) equals $utf_validation_check[n]
    $utf_validation_mask = array(
        2 => "\xE0\xC0",
        3 => "\xF0\xC0\xC0",
        4 => "\xF8\xC0\xC0\xC0"
    );

    $utf_validation_check = array(
        2 => "\xC0\x80",
        3 => "\xE0\x80\x80",
        4 => "\xF0\x80\x80\x80"
    );

    // Main loop
    do
    {
        // STEP 0: Capture the current char and buffer it
        $c = $str[$pos];
        $c_mask = $c & "\xF0";

        if (isset($utf_len_mask[$c_mask]))
        {
            // Byte at $pos is either a leading byte or a misplaced trailing byte
            if ($utf_len = $utf_len_mask[$c_mask])
            {
                // Capture the char
                $buffer[++$i & 7] = $utf_char = substr($str, $pos, $utf_len);

                // Let's find out if a thorough check is needed
                if (isset($qc[$utf_char]))
                {
                    // If the UTF char is in the qc array then it may not be in normal form. We do nothing here, the actual processing is below this "if" block
                }
                else if (isset($utf_combining_class[$utf_char]))
                {
                    if ($utf_combining_class[$utf_char] < $last_cc)
                    {
                        // A combining character that is NOT canonically ordered
                    }
                    else
                    {
                        // A combining character that IS canonically ordered, skip to the next char
                        $last_cc = $utf_combining_class[$utf_char];

                        $pos += $utf_len;
                        continue;
                    }
                }
                else
                {
                    // At this point, $utf_char holds a UTF char that we know is not a NF[K]C_QC and is not a combining character.
                    // It can be a singleton, a canonical composite, a replacement char or even an ill-formed bunch of bytes. Let's find out
                    $last_cc = 0;

                    // Check that we have the correct number of trailing bytes
                    if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
                    {
                        // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
                        // has been encoded in a five- or six- byte sequence
                        if ($utf_char[0] >= "\xF8")
                        {
                            if ($utf_char[0] < "\xFC")
                            {
                                $trailing_bytes = 4;
                            }
                            else if ($utf_char[0] > "\xFD")
                            {
                                $trailing_bytes = 0;
                            }
                            else
                            {
                                $trailing_bytes = 5;
                            }
                        }
                        else
                        {
                            $trailing_bytes = $utf_len - 1;
                        }

                        $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
                        // ++$pos skips the leading byte, then strspn() skips at most $trailing_bytes trailing bytes
                        $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
                        $tmp_pos = $pos;

                        continue;
                    }

                    if (isset($extra_check[$c]))
                    {
                        switch ($c)
                        {
                            // Note: 0xED is quite common in Korean
                            case "\xED":
                                if ($utf_char >= "\xED\xA0\x80")
                                {
                                    // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
                                    $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
                                    $pos += $utf_len;
                                    $tmp_pos = $pos;
                                    continue 2;
                                }
                            break;

                            // Note: 0xEF is quite common in Japanese
                            case "\xEF":
                                if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
                                {
                                    // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
                                    $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
                                    $pos += $utf_len;
                                    $tmp_pos = $pos;
                                    continue 2;
                                }
                            break;

                            case "\xC0":
                            case "\xC1":
                                if ($utf_char <= "\xC1\xBF")
                                {
                                    // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
                                    $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
                                    $pos += $utf_len;
                                    $tmp_pos = $pos;
                                    continue 2;
                                }
                            break;

                            case "\xE0":
                                if ($utf_char <= "\xE0\x9F\xBF")
                                {
                                    // Unicode char U+0000..U+07FF encoded in 3 bytes
                                    $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
                                    $pos += $utf_len;
                                    $tmp_pos = $pos;
                                    continue 2;
                                }
                            break;

                            case "\xF0":
                                if ($utf_char <= "\xF0\x8F\xBF\xBF")
                                {
                                    // Unicode char U+0000..U+FFFF encoded in 4 bytes
                                    $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
                                    $pos += $utf_len;
                                    $tmp_pos = $pos;
                                    continue 2;
                                }
                            break;

                            default:
                                // Five- and six- byte sequences do not need being checked for here anymore
                                if ($utf_char > UTF8_MAX)
                                {
                                    // Out of the Unicode range
                                    if ($utf_char[0] < "\xF8")
                                    {
                                        $trailing_bytes = 3;
                                    }
                                    else if ($utf_char[0] < "\xFC")
                                    {
                                        $trailing_bytes = 4;
                                    }
                                    else if ($utf_char[0] > "\xFD")
                                    {
                                        $trailing_bytes = 0;
                                    }
                                    else
                                    {
                                        $trailing_bytes = 5;
                                    }

                                    $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
                                    $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
                                    $tmp_pos = $pos;
                                    continue 2;
                                }
                            break;
                        }
                    }

                    // The char is a valid starter, move the cursor and go on
                    $pos += $utf_len;
                    continue;
                }
            }
            else
            {
                // A trailing byte came out of nowhere, we will advance the cursor and treat this byte and all following trailing bytes as if
                // each of them was a Unicode replacement char
                $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
                $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);

                $pos += $spn;
                $tmp_pos = $pos;
                continue;
            }


            // STEP 1: Decompose current char

            // We have found a character that is either:
            //  - in the NFC_QC/NFKC_QC list
            //  - a non-starter char that is not canonically ordered
            //
            // We are going to capture the shortest UTF sequence that satisfies these two conditions:
            //
            //  1 - If the sequence does not start at the beginning of the string, it must begin with a starter,
            // and that starter must not have the NF[K]C_QC property equal to "MAYBE"
            //
            //  2 - If the sequence does not end at the end of the string, it must end with a non-starter and be
            // immediately followed by a starter that is not on the QC list
            //
            $utf_seq = array();
            $last_cc = 0;
            $lpos = $pos;
            $pos += $utf_len;

            if (isset($decomp_map[$utf_char]))
            {
                $_pos = 0;
                $_len = strlen($decomp_map[$utf_char]);

                do
                {
                    $_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];

                    if (isset($_utf_len))
                    {
                        $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
                        $_pos += $_utf_len;
                    }
                    else
                    {
                        $utf_seq[] = $decomp_map[$utf_char][$_pos];
                        ++$_pos;
                    }
                }
                while ($_pos < $_len);
            }
            else
            {
                // The char is not decomposable
                $utf_seq = array($utf_char);
            }


            // STEP 2: Capture the starter

            // Check out the combining class of the first character of the UTF sequence
            $k = 0;

            // Note: a char missing from $qc used to be read unguarded here and compared equal to UNICODE_QC_MAYBE (0).
            // The isset() test keeps that outcome without raising an undefined index notice.
            if (isset($utf_combining_class[$utf_seq[0]]) || !isset($qc[$utf_char]) || $qc[$utf_char] == UNICODE_QC_MAYBE)
            {
                // Not a starter, inspect previous characters
                // The last 8 characters are kept in a buffer so that we don't have to capture them everytime.
                // This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,
                // although it is slower than this method.
                //
                // In the following loop, $j starts at the previous buffered character ($i - 1, because current character is
                // at offset $i) and process them in backward mode until we find a starter.
                //
                // $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more
                // characters numbered 0 to n. $k starts at 0 and for each char we prepend we pre-decrement it and use the result
                // as the index of the prepended char
                $starter_found = 0;
                $j_min = max(1, $i - 7);

                for ($j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j)
                {
                    $utf_char = $buffer[$j & 7];
                    $lpos -= strlen($utf_char);

                    if (isset($decomp_map[$utf_char]))
                    {
                        // The char is a composite, decompose for storage
                        $decomp_seq = array();
                        $_pos = 0;
                        $_len = strlen($decomp_map[$utf_char]);

                        do
                        {
                            $c = $decomp_map[$utf_char][$_pos];
                            $_utf_len =& $utf_len_mask[$c & "\xF0"];

                            if (isset($_utf_len))
                            {
                                $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
                                $_pos += $_utf_len;
                            }
                            else
                            {
                                $decomp_seq[] = $c;
                                ++$_pos;
                            }
                        }
                        while ($_pos < $_len);

                        // Prepend the UTF sequence with our decomposed sequence
                        if (isset($decomp_seq[1]))
                        {
                            // The char expanded into several chars
                            $decomp_cnt = sizeof($decomp_seq);

                            foreach ($decomp_seq as $decomp_i => $decomp_char)
                            {
                                $utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;
                            }
                            $k -= $decomp_cnt;
                        }
                        else
                        {
                            // Decomposed to a single char, easier to prepend
                            $utf_seq[--$k] = $decomp_seq[0];
                        }
                    }
                    else
                    {
                        $utf_seq[--$k] = $utf_char;
                    }

                    if (!isset($utf_combining_class[$utf_seq[$k]]))
                    {
                        // We have found our starter
                        $starter_found = 1;
                        break;
                    }
                }

                if (!$starter_found && $lpos > $tmp_pos)
                {
                    // The starter was not found in the buffer, let's rewind some more
                    do
                    {
                        // $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_len > 0 then it's a leading byte, otherwise it's a trailing byte.
                        $c = $str[--$lpos];
                        $c_mask = $c & "\xF0";

                        if (isset($utf_len_mask[$c_mask]))
                        {
                            // UTF byte
                            if ($utf_len = $utf_len_mask[$c_mask])
                            {
                                // UTF *leading* byte
                                $utf_char = substr($str, $lpos, $utf_len);

                                if (isset($decomp_map[$utf_char]))
                                {
                                    // Decompose the character
                                    $decomp_seq = array();
                                    $_pos = 0;
                                    $_len = strlen($decomp_map[$utf_char]);

                                    do
                                    {
                                        $c = $decomp_map[$utf_char][$_pos];
                                        $_utf_len =& $utf_len_mask[$c & "\xF0"];

                                        if (isset($_utf_len))
                                        {
                                            $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
                                            $_pos += $_utf_len;
                                        }
                                        else
                                        {
                                            $decomp_seq[] = $c;
                                            ++$_pos;
                                        }
                                    }
                                    while ($_pos < $_len);

                                    // Prepend the UTF sequence with our decomposed sequence
                                    if (isset($decomp_seq[1]))
                                    {
                                        // The char expanded into several chars
                                        $decomp_cnt = sizeof($decomp_seq);
                                        foreach ($decomp_seq as $decomp_i => $utf_char)
                                        {
                                            $utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;
                                        }
                                        $k -= $decomp_cnt;
                                    }
                                    else
                                    {
                                        // Decomposed to a single char, easier to prepend
                                        $utf_seq[--$k] = $decomp_seq[0];
                                    }
                                }
                                else
                                {
                                    $utf_seq[--$k] = $utf_char;
                                }
                            }
                        }
                        else
                        {
                            // ASCII char
                            $utf_seq[--$k] = $c;
                        }
                    }
                    while ($lpos > $tmp_pos);
                }
            }


            // STEP 3: Capture following combining modifiers

            while ($pos < $len)
            {
                $c_mask = $str[$pos] & "\xF0";

                if (isset($utf_len_mask[$c_mask]))
                {
                    if ($utf_len = $utf_len_mask[$c_mask])
                    {
                        $utf_char = substr($str, $pos, $utf_len);
                    }
                    else
                    {
                        // A trailing byte came out of nowhere
                        // Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop
                        // as if it was a starter (replacement chars ARE starters) and let the next loop replace it
                        break;
                    }

                    if (isset($utf_combining_class[$utf_char]) || isset($qc[$utf_char]))
                    {
                        // Combining character, add it to the sequence and move the cursor
                        if (isset($decomp_map[$utf_char]))
                        {
                            // Decompose the character
                            $_pos = 0;
                            $_len = strlen($decomp_map[$utf_char]);

                            do
                            {
                                $c = $decomp_map[$utf_char][$_pos];
                                $_utf_len =& $utf_len_mask[$c & "\xF0"];

                                if (isset($_utf_len))
                                {
                                    $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
                                    $_pos += $_utf_len;
                                }
                                else
                                {
                                    $utf_seq[] = $c;
                                    ++$_pos;
                                }
                            }
                            while ($_pos < $_len);
                        }
                        else
                        {
                            $utf_seq[] = $utf_char;
                        }

                        $pos += $utf_len;
                    }
                    else
                    {
                        // Combining class 0 and no QC, break out of the loop
                        // Note: we do not know if that character is valid. If it's not, the next iteration will replace it
                        break;
                    }
                }
                else
                {
                    // ASCII chars are starters
                    break;
                }
            }


            // STEP 4: Sort and combine

            // Here we sort...
            $k_max = $k + sizeof($utf_seq);

            if (!$k && $k_max == 1)
            {
                // There is only one char in the UTF sequence, add it then jump to the next iteration of main loop
                // Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases
//              if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))
//              {
                    $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $utf_seq[0];
                    $tmp_pos = $pos;
//              }

                continue;
            }

            // ...there we combine
            if (isset($utf_combining_class[$utf_seq[$k]]))
            {
                $starter = $nf_seq = '';
            }
            else
            {
                $starter = $utf_seq[$k++];
                $nf_seq = '';
            }
            $utf_sort = array();

            // We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine
            // at the end of the string without altering it
            $utf_seq[] = '';

            do
            {
                $utf_char = $utf_seq[$k++];

                if (isset($utf_combining_class[$utf_char]))
                {
                    $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
                }
                else
                {
                    if (empty($utf_sort))
                    {
                        // No combining characters... check for a composite of the two starters
                        if (isset($utf_canonical_comp[$starter . $utf_char]))
                        {
                            // Good ol' composite character
                            $starter = $utf_canonical_comp[$starter . $utf_char];
                        }
                        else if (isset($utf_jamo_type[$utf_char]))
                        {
                            // Current char is a composable jamo
                            if (isset($utf_jamo_type[$starter]) && $utf_jamo_type[$starter] == UNICODE_JAMO_L && $utf_jamo_type[$utf_char] == UNICODE_JAMO_V)
                            {
                                // We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo
                                if (isset($utf_jamo_type[$utf_seq[$k]]) && $utf_jamo_type[$utf_seq[$k]] == UNICODE_JAMO_T)
                                {
                                    // L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)
                                    $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char] + $utf_jamo_index[$utf_seq[$k]];
                                    ++$k;
                                }
                                else
                                {
                                    // L+V jamos, combine to a LV Hangul syllable
                                    $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char];
                                }

                                // Encode the code point of the syllable as a three-byte UTF-8 sequence
                                $starter = chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
                            }
                            else
                            {
                                // Non-composable jamo, just add it to the sequence
                                $nf_seq .= $starter;
                                $starter = $utf_char;
                            }
                        }
                        else
                        {
                            // No composite, just add the first starter to the sequence then continue with the other one
                            $nf_seq .= $starter;
                            $starter = $utf_char;
                        }
                    }
                    else
                    {
                        ksort($utf_sort);

                        // For each class of combining characters
                        foreach ($utf_sort as $cc => $utf_chars)
                        {
                            $j = 0;

                            do
                            {
                                // Look for a composite
                                if (isset($utf_canonical_comp[$starter . $utf_chars[$j]]))
                                {
                                    // Found a composite, replace the starter
                                    $starter = $utf_canonical_comp[$starter . $utf_chars[$j]];
                                    unset($utf_sort[$cc][$j]);
                                }
                                else
                                {
                                    // No composite, all following characters in that class are blocked
                                    break;
                                }
                            }
                            while (isset($utf_sort[$cc][++$j]));
                        }

                        // Add the starter to the normalized sequence, followed by non-starters in canonical order
                        $nf_seq .= $starter;

                        foreach ($utf_sort as $utf_chars)
                        {
                            if (!empty($utf_chars))
                            {
                                $nf_seq .= implode('', $utf_chars);
                            }
                        }

                        // Reset the array and go on
                        $utf_sort = array();
                        $starter = $utf_char;
                    }
                }
            }
            while ($k <= $k_max);

            $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $nf_seq;
            $tmp_pos = $pos;
        }
        else
        {
            // Only an ASCII char can make the program get here
            //
            // First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().
            //
            // The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on
            // multi-byte text (where the only ASCII chars are spaces and punctuation)
            if (++$pos != $len)
            {
                if ($str[$pos] < "\x80")
                {
                    $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
                    $buffer[++$i & 7] = $str[$pos - 1];
                }
                else
                {
                    $buffer[++$i & 7] = $c;
                }
            }
        }
    }
    while ($pos < $len);

    // Now is time to return the string
    if ($tmp_pos)
    {
        // If the $tmp_pos cursor is not at the beginning of the string then at least one character was not in normal form. Replace $str with the fixed version
        if ($tmp_pos == $len)
        {
            // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
            return $tmp;
        }
        else
        {
            // The rightmost chunk of $str has not been appended to $tmp yet
            return $tmp . substr($str, $tmp_pos);
        }
    }

    // The string was already in normal form
    return $str;
}
935
936 /**
937 * Decompose a UTF string
938 *
939 * @param string $str UTF string
940 * @param integer $pos Position of the first UTF char (in bytes)
941 * @param integer $len Length of the string (in bytes)
942 * @param array &$decomp_map Decomposition mapping, passed by reference but never modified
943 * @return string The string, decomposed and sorted canonically
944 *
945 * @access private
946 */
947 function decompose($str, $pos, $len, &$decomp_map)
948 {
949 global $utf_combining_class;
950
951 // Load some commonly-used tables
952 if (!isset($utf_combining_class))
953 {
954 global $phpbb_root_path, $phpEx;
955 include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
956 }
957
958 // UTF char length array
959 $utf_len_mask = array(
960 // Leading bytes masks
961 "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
962 // Trailing bytes masks
963 "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
964 );
965
966 // Some extra checks are triggered on the first byte of a UTF sequence
967 $extra_check = array(
968 "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
969 "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
970 "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
971 );
972
973 // These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge:
974 // - 2-byte: 110? ???? 10?? ????
975 // - 3-byte: 1110 ???? 10?? ???? 10?? ????
976 // - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????
977 // Note that 5- and 6- byte sequences are automatically discarded
978 $utf_validation_mask = array(
979 2 => "\xE0\xC0",
980 3 => "\xF0\xC0\xC0",
981 4 => "\xF8\xC0\xC0\xC0"
982 );
983
984 $utf_validation_check = array(
985 2 => "\xC0\x80",
986 3 => "\xE0\x80\x80",
987 4 => "\xF0\x80\x80\x80"
988 );
989
990 $tmp = '';
991 $starter_pos = $pos;
992 $tmp_pos = $last_cc = $sort = $dump = 0;
993 $utf_sort = array();
994
995
996 // Main loop
997 do
998 {
999 // STEP 0: Capture the current char
1000
1001 $cur_mask = $str[$pos] & "\xF0";
1002 if (isset($utf_len_mask[$cur_mask]))
1003 {
1004 if ($utf_len = $utf_len_mask[$cur_mask])
1005 {
1006 // Multibyte char
1007 $utf_char = substr($str, $pos, $utf_len);
1008 $pos += $utf_len;
1009 }
1010 else
1011 {
1012 // A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode
1013 // replacement char and we will advance the cursor
1014 $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
1015
1016 if ($dump)
1017 {
1018 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1019
1020 // Dump combiners
1021 if (!empty($utf_sort))
1022 {
1023 if ($sort)
1024 {
1025 ksort($utf_sort);
1026 }
1027
1028 foreach ($utf_sort as $utf_chars)
1029 {
1030 $tmp .= implode('', $utf_chars);
1031 }
1032 }
1033
1034 $tmp .= str_repeat(UTF8_REPLACEMENT, $spn);
1035 $dump = $sort = 0;
1036 }
1037 else
1038 {
1039 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
1040 }
1041
1042 $pos += $spn;
1043 $tmp_pos = $starter_pos = $pos;
1044
1045 $utf_sort = array();
1046 $last_cc = 0;
1047
1048 continue;
1049 }
1050
1051
1052 // STEP 1: Decide what to do with current char
1053
1054 // Now, in that order:
1055 // - check if that character is decomposable
1056 // - check if that character is a non-starter
1057 // - check if that character requires extra checks to be performed
1058 if (isset($decomp_map[$utf_char]))
1059 {
1060 // Decompose the char
1061 $_pos = 0;
1062 $_len = strlen($decomp_map[$utf_char]);
1063
1064 do
1065 {
1066 $c = $decomp_map[$utf_char][$_pos];
1067 $_utf_len =& $utf_len_mask[$c & "\xF0"];
1068
1069 if (isset($_utf_len))
1070 {
1071 $_utf_char = substr($decomp_map[$utf_char], $_pos, $_utf_len);
1072 $_pos += $_utf_len;
1073
1074 if (isset($utf_combining_class[$_utf_char]))
1075 {
1076 // The character decomposed to a non-starter, buffer it for sorting
1077 $utf_sort[$utf_combining_class[$_utf_char]][] = $_utf_char;
1078
1079 if ($utf_combining_class[$_utf_char] < $last_cc)
1080 {
1081 // Not canonically ordered, will require sorting
1082 $sort = $dump = 1;
1083 }
1084 else
1085 {
1086 $dump = 1;
1087 $last_cc = $utf_combining_class[$_utf_char];
1088 }
1089 }
1090 else
1091 {
1092 // This character decomposition contains a starter, dump the buffer and continue
1093 if ($dump)
1094 {
1095 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1096
1097 // Dump combiners
1098 if (!empty($utf_sort))
1099 {
1100 if ($sort)
1101 {
1102 ksort($utf_sort);
1103 }
1104
1105 foreach ($utf_sort as $utf_chars)
1106 {
1107 $tmp .= implode('', $utf_chars);
1108 }
1109 }
1110
1111 $tmp .= $_utf_char;
1112 $dump = $sort = 0;
1113 }
1114 else
1115 {
1116 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos) . $_utf_char;
1117 }
1118
1119 $tmp_pos = $starter_pos = $pos;
1120 $utf_sort = array();
1121 $last_cc = 0;
1122 }
1123 }
1124 else
1125 {
1126 // This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue
1127 ++$_pos;
1128
1129 if ($dump)
1130 {
1131 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1132
1133 // Dump combiners
1134 if (!empty($utf_sort))
1135 {
1136 if ($sort)
1137 {
1138 ksort($utf_sort);
1139 }
1140
1141 foreach ($utf_sort as $utf_chars)
1142 {
1143 $tmp .= implode('', $utf_chars);
1144 }
1145 }
1146
1147 $tmp .= $c;
1148 $dump = $sort = 0;
1149 }
1150 else
1151 {
1152 $tmp .= substr($str, $tmp_pos, $pos - $utf_len - $tmp_pos) . $c;
1153 }
1154
1155 $tmp_pos = $starter_pos = $pos;
1156 $utf_sort = array();
1157 $last_cc = 0;
1158 }
1159 }
1160 while ($_pos < $_len);
1161 }
1162 else if (isset($utf_combining_class[$utf_char]))
1163 {
1164 // Combining character
1165 if ($utf_combining_class[$utf_char] < $last_cc)
1166 {
1167 // Not in canonical order
1168 $sort = $dump = 1;
1169 }
1170 else
1171 {
1172 $last_cc = $utf_combining_class[$utf_char];
1173 }
1174
1175 $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
1176 }
1177 else
1178 {
1179 // Non-decomposable starter, check whether it's a Hangul syllable
1180 if ($utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST)
1181 {
1182 // Nope, regular UTF char, check that we have the correct number of trailing bytes
1183 if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
1184 {
1185 // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
1186 // has been encoded in a five- or six-byte sequence.
1187 // Move the cursor back to its original position then advance it to the position it should really be at
1188 $pos -= $utf_len;
1189 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1190
1191 if (!empty($utf_sort))
1192 {
1193 ksort($utf_sort);
1194
1195 foreach ($utf_sort as $utf_chars)
1196 {
1197 $tmp .= implode('', $utf_chars);
1198 }
1199 $utf_sort = array();
1200 }
1201
1202 // Add a replacement char then another replacement char for every trailing byte.
1203 //
1204 // @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this
1205 $spn = strspn($str, UTF8_TRAILING_BYTES, ++$pos);
1206 $tmp .= str_repeat(UTF8_REPLACEMENT, $spn + 1);
1207
1208 $dump = $sort = 0;
1209
1210 $pos += $spn;
1211 $tmp_pos = $pos;
1212 continue;
1213 }
1214
1215 if (isset($extra_check[$utf_char[0]]))
1216 {
1217 switch ($utf_char[0])
1218 {
1219 // Note: 0xED is quite common in Korean
1220 case "\xED":
1221 if ($utf_char >= "\xED\xA0\x80")
1222 {
1223 // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
1224 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1225
1226 if (!empty($utf_sort))
1227 {
1228 ksort($utf_sort);
1229
1230 foreach ($utf_sort as $utf_chars)
1231 {
1232 $tmp .= implode('', $utf_chars);
1233 }
1234 $utf_sort = array();
1235 }
1236
1237 $tmp .= UTF8_REPLACEMENT;
1238 $dump = $sort = 0;
1239
1240 $tmp_pos = $starter_pos = $pos;
1241 continue 2;
1242 }
1243 break;
1244
1245 // Note: 0xEF is quite common in Japanese
1246 case "\xEF":
1247 if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
1248 {
1249 // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
1250 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1251
1252 if (!empty($utf_sort))
1253 {
1254 ksort($utf_sort);
1255
1256 foreach ($utf_sort as $utf_chars)
1257 {
1258 $tmp .= implode('', $utf_chars);
1259 }
1260 $utf_sort = array();
1261 }
1262
1263 $tmp .= UTF8_REPLACEMENT;
1264 $dump = $sort = 0;
1265
1266 $tmp_pos = $starter_pos = $pos;
1267 continue 2;
1268 }
1269 break;
1270
1271 case "\xC0":
1272 case "\xC1":
1273 if ($utf_char <= "\xC1\xBF")
1274 {
1275 // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
1276 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1277
1278 if (!empty($utf_sort))
1279 {
1280 ksort($utf_sort);
1281
1282 foreach ($utf_sort as $utf_chars)
1283 {
1284 $tmp .= implode('', $utf_chars);
1285 }
1286 $utf_sort = array();
1287 }
1288
1289 $tmp .= UTF8_REPLACEMENT;
1290 $dump = $sort = 0;
1291
1292 $tmp_pos = $starter_pos = $pos;
1293 continue 2;
1294 }
1295 break;
1296
1297 case "\xE0":
1298 if ($utf_char <= "\xE0\x9F\xBF")
1299 {
1300 // Unicode char U+0000..U+07FF encoded in 3 bytes
1301 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1302
1303 if (!empty($utf_sort))
1304 {
1305 ksort($utf_sort);
1306
1307 foreach ($utf_sort as $utf_chars)
1308 {
1309 $tmp .= implode('', $utf_chars);
1310 }
1311 $utf_sort = array();
1312 }
1313
1314 $tmp .= UTF8_REPLACEMENT;
1315 $dump = $sort = 0;
1316
1317 $tmp_pos = $starter_pos = $pos;
1318 continue 2;
1319 }
1320 break;
1321
1322 case "\xF0":
1323 if ($utf_char <= "\xF0\x8F\xBF\xBF")
1324 {
1325 // Unicode char U+0000..U+FFFF encoded in 4 bytes
1326 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1327
1328 if (!empty($utf_sort))
1329 {
1330 ksort($utf_sort);
1331
1332 foreach ($utf_sort as $utf_chars)
1333 {
1334 $tmp .= implode('', $utf_chars);
1335 }
1336 $utf_sort = array();
1337 }
1338
1339 $tmp .= UTF8_REPLACEMENT;
1340 $dump = $sort = 0;
1341
1342 $tmp_pos = $starter_pos = $pos;
1343 continue 2;
1344 }
1345 break;
1346
1347 default:
1348 if ($utf_char > UTF8_MAX)
1349 {
1350 // Out of the Unicode range
1351 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1352
1353 if (!empty($utf_sort))
1354 {
1355 ksort($utf_sort);
1356
1357 foreach ($utf_sort as $utf_chars)
1358 {
1359 $tmp .= implode('', $utf_chars);
1360 }
1361 $utf_sort = array();
1362 }
1363
1364 $tmp .= UTF8_REPLACEMENT;
1365 $dump = $sort = 0;
1366
1367 $tmp_pos = $starter_pos = $pos;
1368 continue 2;
1369 }
1370 break;
1371 }
1372 }
1373 }
1374 else
1375 {
1376 // Hangul syllable
1377 $idx = (((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE;
1378
1379 // LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).
1380 //
1381 // The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte
1382 if ($t_index = $idx % UNICODE_HANGUL_TCOUNT)
1383 {
1384 if ($t_index < 25)
1385 {
1386 $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";
1387 $utf_char[8] = chr(0xA7 + $t_index);
1388 }
1389 else
1390 {
1391 $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";
1392 $utf_char[8] = chr(0x67 + $t_index);
1393 }
1394 }
1395 else
1396 {
1397 $utf_char = "\xE1\x84\x00\xE1\x85\x00";
1398 }
1399
1400 $utf_char[2] = chr(0x80 + (int) ($idx / UNICODE_HANGUL_NCOUNT));
1401 $utf_char[5] = chr(0xA1 + (int) (($idx % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT));
1402
1403 // Just like other decompositions, the resulting Jamos must be dumped to the tmp string
1404 $dump = 1;
1405 }
1406
1407 // Do we need to dump stuff to the tmp string?
1408 if ($dump)
1409 {
1410 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1411
1412 // Dump combiners
1413 if (!empty($utf_sort))
1414 {
1415 if ($sort)
1416 {
1417 ksort($utf_sort);
1418 }
1419
1420 foreach ($utf_sort as $utf_chars)
1421 {
1422 $tmp .= implode('', $utf_chars);
1423 }
1424 }
1425
1426 $tmp .= $utf_char;
1427 $dump = $sort = 0;
1428 $tmp_pos = $pos;
1429 }
1430
1431 $last_cc = 0;
1432 $utf_sort = array();
1433 $starter_pos = $pos;
1434 }
1435 }
1436 else
1437 {
1438 // ASCII char, which happens to be a starter (as any other ASCII char)
1439 if ($dump)
1440 {
1441 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1442
1443 // Dump combiners
1444 if (!empty($utf_sort))
1445 {
1446 if ($sort)
1447 {
1448 ksort($utf_sort);
1449 }
1450
1451 foreach ($utf_sort as $utf_chars)
1452 {
1453 $tmp .= implode('', $utf_chars);
1454 }
1455 }
1456
1457 $tmp .= $str[$pos];
1458 $dump = $sort = 0;
1459 $tmp_pos = ++$pos;
1460
1461 $pos += strspn($str, UTF8_ASCII_RANGE, $pos);
1462 }
1463 else
1464 {
1465 $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
1466 }
1467
1468 $last_cc = 0;
1469 $utf_sort = array();
1470 $starter_pos = $pos;
1471 }
1472 }
1473 while ($pos < $len);
1474
1475 // Now it is time to return the string
1476 if ($dump)
1477 {
1478 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1479
1480 // Dump combiners
1481 if (!empty($utf_sort))
1482 {
1483 if ($sort)
1484 {
1485 ksort($utf_sort);
1486 }
1487
1488 foreach ($utf_sort as $utf_chars)
1489 {
1490 $tmp .= implode('', $utf_chars);
1491 }
1492 }
1493
1494 return $tmp;
1495 }
1496 else if ($tmp_pos)
1497 {
1498 // If the $tmp_pos cursor was moved then at least one character was not in normal form. Replace $str with the fixed version
1499 if ($tmp_pos == $len)
1500 {
1501 // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
1502 return $tmp;
1503 }
1504 else
1505 {
1506 // The rightmost chunk of $str has not been appended to $tmp yet
1507 return $tmp . substr($str, $tmp_pos);
1508 }
1509 }
1510
1511 // The string was already in normal form
1512 return $str;
1513 }
1514}
1515
1516?>
Note: See TracBrowser for help on using the repository browser.