1 | <?php
|
---|
2 | /**
|
---|
3 | *
|
---|
4 | * @package utf
|
---|
5 | * @version $Id: utf_normalizer.php 8479 2008-03-29 00:22:48Z naderman $
|
---|
6 | * @copyright (c) 2005 phpBB Group
|
---|
7 | * @license http://opensource.org/licenses/gpl-license.php GNU Public License
|
---|
8 | *
|
---|
9 | */
|
---|
10 |
|
---|
11 | /**
|
---|
12 | */
|
---|
// Direct-access guard: stop immediately unless the including script has
// declared the IN_PHPBB constant beforehand.
if (defined('IN_PHPBB') === false)
{
	exit();
}
|
---|
17 |
|
---|
18 | /**
|
---|
19 | * Some Unicode characters encoded in UTF-8
|
---|
20 | *
|
---|
21 | * Preserved for compatibility
|
---|
22 | */
|
---|
// Each constant holds the raw UTF-8 byte sequence of one code point.
// The code point noted beside each line is what those bytes decode to.

// Substitute written out in place of invalid input
define('UTF8_REPLACEMENT',		"\xEF\xBF\xBD");		// U+FFFD

// Upper bound compared against when rejecting out-of-range sequences
define('UTF8_MAX',				"\xF4\x8F\xBF\xBF");	// U+10FFFF

// The two code points that are explicitly rejected
define('UTF8_FFFE',				"\xEF\xBF\xBE");		// U+FFFE
define('UTF8_FFFF',				"\xEF\xBF\xBF");		// U+FFFF

// Surrogate range boundaries
define('UTF8_SURROGATE_FIRST',	"\xED\xA0\x80");		// U+D800
define('UTF8_SURROGATE_LAST',	"\xED\xBF\xBF");		// U+DFFF

// Hangul range boundaries
define('UTF8_HANGUL_FIRST',		"\xEA\xB0\x80");		// U+AC00
define('UTF8_HANGUL_LAST',		"\xED\x9E\xA3");		// U+D7A3

// CJK range boundaries
define('UTF8_CJK_FIRST',		"\xE4\xB8\x80");		// U+4E00
define('UTF8_CJK_LAST',			"\xE9\xBE\xBB");		// U+9FBB

// CJK "B" range boundaries
define('UTF8_CJK_B_FIRST',		"\xF0\xA0\x80\x80");	// U+20000
define('UTF8_CJK_B_LAST',		"\xF0\xAA\x9B\x96");	// U+2A6D6
|
---|
36 |
|
---|
37 | // Unset global variables
|
---|
// Drop any pre-existing globals that share a name with the normalizer's lookup
// tables; the tables are loaded again on demand through isset() checks.
unset(
	$GLOBALS['utf_jamo_index'],
	$GLOBALS['utf_jamo_type'],
	$GLOBALS['utf_nfc_qc'],
	$GLOBALS['utf_combining_class'],
	$GLOBALS['utf_canonical_comp'],
	$GLOBALS['utf_canonical_decomp'],
	$GLOBALS['utf_nfkc_qc'],
	$GLOBALS['utf_compatibility_decomp']
);
|
---|
39 |
|
---|
40 | // NFC_QC and NFKC_QC values
|
---|
// Values stored in the NFC_QC / NFKC_QC quick-check tables.
// recompose() compares a table entry against UNICODE_QC_MAYBE.
define('UNICODE_QC_NO',		1);
define('UNICODE_QC_MAYBE',	0);
|
---|
43 |
|
---|
44 | // Contains all the ASCII characters appearing in UTF-8, sorted by frequency
|
---|
// Every continuation ("tail") byte value 0x80..0xBF that can follow the lead byte
// of a multibyte UTF-8 char. Passed to strspn() to skip runs of trailing bytes.
define('UTF8_TRAILING_BYTES', "\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A");

// Every single-byte (ASCII) value 0x00..0x7F, ordered by frequency according to the
// original author. Passed to strspn() to measure the ASCII-only prefix of a string.
define('UTF8_ASCII_RANGE', "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F");
|
---|
49 |
|
---|
50 | // Constants used by the Hangul [de]composition algorithms
|
---|
// Constants used by the Hangul [de]composition algorithms.

// Base code points (SBASE is the code point that UTF8_HANGUL_FIRST encodes)
define('UNICODE_HANGUL_SBASE',	0xAC00);
define('UNICODE_HANGUL_LBASE',	0x1100);
define('UNICODE_HANGUL_VBASE',	0x1161);
define('UNICODE_HANGUL_TBASE',	0x11A7);

// Counts. Note that NCOUNT == VCOUNT * TCOUNT and SCOUNT == LCOUNT * NCOUNT
define('UNICODE_HANGUL_LCOUNT',	19);
define('UNICODE_HANGUL_VCOUNT',	21);
define('UNICODE_HANGUL_TCOUNT',	28);
define('UNICODE_HANGUL_NCOUNT',	588);
define('UNICODE_HANGUL_SCOUNT',	11172);

// Jamo types (L, V, T) that the composition step checks when building a syllable
define('UNICODE_JAMO_L',	0);
define('UNICODE_JAMO_V',	1);
define('UNICODE_JAMO_T',	2);
|
---|
63 |
|
---|
64 | /**
|
---|
65 | * Unicode normalization routines
|
---|
66 | *
|
---|
67 | * @package utf
|
---|
68 | */
|
---|
69 | class utf_normalizer
|
---|
70 | {
|
---|
/**
 * Validate, cleanup and normalize a string
 *
 * The ultimate convenience function! Cleans up invalid UTF-8 sequences
 * and converts the string to Normal Form C (canonical composition).
 *
 * @param string &$str The dirty string; it is cleaned up in place
 * @return void Nothing is returned, the shiny cleaned-up result is written back to $str
 */
|
---|
function cleanup(&$str)
{
	// Length of the leading run of "safe" bytes. The accepted set is every printable ASCII
	// char plus \t, \n, \r and 0x7F, listed by frequency in latin text.
	$ascii_run = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");
	$byte_count = strlen($str);

	if ($ascii_run == $byte_count)
	{
		// Only safe ASCII was found: $str is left untouched and nothing is returned
		return;
	}

	// Load the NFC quick-check table the first time it is needed
	if (isset($GLOBALS['utf_nfc_qc']) === false)
	{
		global $phpbb_root_path, $phpEx;
		include $phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx;
	}

	// Likewise for the canonical decomposition map
	if (isset($GLOBALS['utf_canonical_decomp']) === false)
	{
		global $phpbb_root_path, $phpEx;
		include $phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx;
	}

	// Every byte in 0x00..0x1F except \t, \n and \r is swapped for 0xFF. That byte is
	// illegal in UTF-8, so it will in turn be replaced with a UTF replacement char.
	$control_bytes = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F";
	$illegal_bytes = "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF";
	$str = strtr($str, $control_bytes, $illegal_bytes);

	// Validate and recompose, starting at the first byte outside the safe run
	$str = utf_normalizer::recompose(
		$str,
		$ascii_run,
		$byte_count,
		$GLOBALS['utf_nfc_qc'],
		$GLOBALS['utf_canonical_decomp']
	);
}
|
---|
115 |
|
---|
/**
 * Validate and normalize a UTF string to NFC
 *
 * @param string &$str Unchecked UTF string; it is validated and normalized in place
 * @return void Nothing is returned, the normalized string is written back to $str
 */
|
---|
function nfc(&$str)
{
	// How many leading bytes are plain ASCII, and how long the string is overall
	$ascii_run = strspn($str, UTF8_ASCII_RANGE);
	$byte_count = strlen($str);

	if ($ascii_run == $byte_count)
	{
		// The string is ASCII from start to end: leave it as is
		return;
	}

	// Load the NFC quick-check table the first time it is needed
	if (isset($GLOBALS['utf_nfc_qc']) === false)
	{
		global $phpbb_root_path, $phpEx;
		include $phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx;
	}

	// Likewise for the canonical decomposition map
	if (isset($GLOBALS['utf_canonical_decomp']) === false)
	{
		global $phpbb_root_path, $phpEx;
		include $phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx;
	}

	// Validate and recompose from the first non-ASCII byte; the result replaces $str
	$str = utf_normalizer::recompose(
		$str,
		$ascii_run,
		$byte_count,
		$GLOBALS['utf_nfc_qc'],
		$GLOBALS['utf_canonical_decomp']
	);
}
|
---|
147 |
|
---|
/**
 * Validate and normalize a UTF string to NFKC
 *
 * @param string &$str Unchecked UTF string; it is validated and normalized in place
 * @return void Nothing is returned, the normalized string is written back to $str
 */
|
---|
function nfkc(&$str)
{
	// How many leading bytes are plain ASCII, and how long the string is overall
	$ascii_run = strspn($str, UTF8_ASCII_RANGE);
	$byte_count = strlen($str);

	if ($ascii_run == $byte_count)
	{
		// The string is ASCII from start to end: leave it as is
		return;
	}

	// Load the NFKC quick-check table the first time it is needed
	if (isset($GLOBALS['utf_nfkc_qc']) === false)
	{
		global $phpbb_root_path, $phpEx;
		include $phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx;
	}

	// Likewise for the compatibility decomposition map
	if (isset($GLOBALS['utf_compatibility_decomp']) === false)
	{
		global $phpbb_root_path, $phpEx;
		include $phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx;
	}

	// Validate and recompose from the first non-ASCII byte; the result replaces $str
	$str = utf_normalizer::recompose(
		$str,
		$ascii_run,
		$byte_count,
		$GLOBALS['utf_nfkc_qc'],
		$GLOBALS['utf_compatibility_decomp']
	);
}
|
---|
179 |
|
---|
/**
 * Validate and normalize a UTF string to NFD
 *
 * @param string &$str Unchecked UTF string; it is validated and normalized in place
 * @return void Nothing is returned, the normalized string is written back to $str
 */
|
---|
function nfd(&$str)
{
	// How many leading bytes are plain ASCII, and how long the string is overall
	$ascii_run = strspn($str, UTF8_ASCII_RANGE);
	$byte_count = strlen($str);

	if ($ascii_run == $byte_count)
	{
		// The string is ASCII from start to end: leave it as is
		return;
	}

	// Load the canonical decomposition map the first time it is needed
	if (isset($GLOBALS['utf_canonical_decomp']) === false)
	{
		global $phpbb_root_path, $phpEx;
		include $phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx;
	}

	// Validate and decompose from the first non-ASCII byte; the result replaces $str
	$str = utf_normalizer::decompose(
		$str,
		$ascii_run,
		$byte_count,
		$GLOBALS['utf_canonical_decomp']
	);
}
|
---|
205 |
|
---|
/**
 * Validate and normalize a UTF string to NFKD
 *
 * @param string &$str Unchecked UTF string; it is validated and normalized in place
 * @return void Nothing is returned, the normalized string is written back to $str
 */
|
---|
function nfkd(&$str)
{
	// How many leading bytes are plain ASCII, and how long the string is overall
	$ascii_run = strspn($str, UTF8_ASCII_RANGE);
	$byte_count = strlen($str);

	if ($ascii_run == $byte_count)
	{
		// The string is ASCII from start to end: leave it as is
		return;
	}

	// Load the compatibility decomposition map the first time it is needed
	if (isset($GLOBALS['utf_compatibility_decomp']) === false)
	{
		global $phpbb_root_path, $phpEx;
		include $phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx;
	}

	// Validate and decompose from the first non-ASCII byte; the result replaces $str
	$str = utf_normalizer::decompose(
		$str,
		$ascii_run,
		$byte_count,
		$GLOBALS['utf_compatibility_decomp']
	);
}
|
---|
231 |
|
---|
232 |
|
---|
233 | /**
|
---|
234 | * Recompose a UTF string
|
---|
235 | *
|
---|
236 | * @param string $str Unchecked UTF string
|
---|
237 | * @param integer $pos Position of the first UTF char (in bytes)
|
---|
238 | * @param integer $len Length of the string (in bytes)
|
---|
239 | * @param array &$qc Quick-check array, passed by reference but never modified
|
---|
240 | * @param array &$decomp_map Decomposition mapping, passed by reference but never modified
|
---|
241 | * @return string The string, validated and recomposed
|
---|
242 | *
|
---|
243 | * @access private
|
---|
244 | */
|
---|
245 | function recompose($str, $pos, $len, &$qc, &$decomp_map)
|
---|
246 | {
|
---|
247 | global $utf_combining_class, $utf_canonical_comp, $utf_jamo_type, $utf_jamo_index;
|
---|
248 |
|
---|
249 | // Load some commonly-used tables
|
---|
250 | if (!isset($utf_jamo_index, $utf_jamo_type, $utf_combining_class))
|
---|
251 | {
|
---|
252 | global $phpbb_root_path, $phpEx;
|
---|
253 | include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
|
---|
254 | }
|
---|
255 |
|
---|
256 | // Load the canonical composition table
|
---|
257 | if (!isset($utf_canonical_comp))
|
---|
258 | {
|
---|
259 | global $phpbb_root_path, $phpEx;
|
---|
260 | include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);
|
---|
261 | }
|
---|
262 |
|
---|
263 | // Buffer the last ASCII char before the UTF-8 stuff if applicable
|
---|
264 | $tmp = '';
|
---|
265 | $i = $tmp_pos = $last_cc = 0;
|
---|
266 |
|
---|
267 | $buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array();
|
---|
268 |
|
---|
269 | // UTF char length array
|
---|
270 | // This array is used to determine the length of a UTF character.
|
---|
271 | // Be $c the result of ($str[$pos] & "\xF0") --where $str is the string we're operating on and $pos
|
---|
272 | // the position of the cursor--, if $utf_len_mask[$c] does not exist, the byte is an ASCII char.
|
---|
273 | // Otherwise, if $utf_len_mask[$c] is greater than 0, we have a the leading byte of a multibyte character
|
---|
274 | // whose length is $utf_len_mask[$c] and if it is equal to 0, the byte is a trailing byte.
|
---|
275 | $utf_len_mask = array(
|
---|
276 | // Leading bytes masks
|
---|
277 | "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
|
---|
278 | // Trailing bytes masks
|
---|
279 | "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
|
---|
280 | );
|
---|
281 |
|
---|
282 | $extra_check = array(
|
---|
283 | "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
|
---|
284 | "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
|
---|
285 | "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
|
---|
286 | );
|
---|
287 |
|
---|
288 | $utf_validation_mask = array(
|
---|
289 | 2 => "\xE0\xC0",
|
---|
290 | 3 => "\xF0\xC0\xC0",
|
---|
291 | 4 => "\xF8\xC0\xC0\xC0"
|
---|
292 | );
|
---|
293 |
|
---|
294 | $utf_validation_check = array(
|
---|
295 | 2 => "\xC0\x80",
|
---|
296 | 3 => "\xE0\x80\x80",
|
---|
297 | 4 => "\xF0\x80\x80\x80"
|
---|
298 | );
|
---|
299 |
|
---|
300 | // Main loop
|
---|
301 | do
|
---|
302 | {
|
---|
303 | // STEP 0: Capture the current char and buffer it
|
---|
304 | $c = $str[$pos];
|
---|
305 | $c_mask = $c & "\xF0";
|
---|
306 |
|
---|
307 | if (isset($utf_len_mask[$c_mask]))
|
---|
308 | {
|
---|
309 | // Byte at $pos is either a leading byte or a missplaced trailing byte
|
---|
310 | if ($utf_len = $utf_len_mask[$c_mask])
|
---|
311 | {
|
---|
312 | // Capture the char
|
---|
313 | $buffer[++$i & 7] = $utf_char = substr($str, $pos, $utf_len);
|
---|
314 |
|
---|
315 | // Let's find out if a thorough check is needed
|
---|
316 | if (isset($qc[$utf_char]))
|
---|
317 | {
|
---|
318 | // If the UTF char is in the qc array then it may not be in normal form. We do nothing here, the actual processing is below this "if" block
|
---|
319 | }
|
---|
320 | else if (isset($utf_combining_class[$utf_char]))
|
---|
321 | {
|
---|
322 | if ($utf_combining_class[$utf_char] < $last_cc)
|
---|
323 | {
|
---|
324 | // A combining character that is NOT canonically ordered
|
---|
325 | }
|
---|
326 | else
|
---|
327 | {
|
---|
328 | // A combining character that IS canonically ordered, skip to the next char
|
---|
329 | $last_cc = $utf_combining_class[$utf_char];
|
---|
330 |
|
---|
331 | $pos += $utf_len;
|
---|
332 | continue;
|
---|
333 | }
|
---|
334 | }
|
---|
335 | else
|
---|
336 | {
|
---|
337 | // At this point, $utf_char holds a UTF char that we know is not a NF[K]C_QC and is not a combining character.
|
---|
338 | // It can be a singleton, a canonical composite, a replacement char or an even an ill-formed bunch of bytes. Let's find out
|
---|
339 | $last_cc = 0;
|
---|
340 |
|
---|
341 | // Check that we have the correct number of trailing bytes
|
---|
342 | if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
|
---|
343 | {
|
---|
344 | // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
|
---|
345 | // has been encoded in a five- or six- byte sequence
|
---|
346 | if ($utf_char[0] >= "\xF8")
|
---|
347 | {
|
---|
348 | if ($utf_char[0] < "\xFC")
|
---|
349 | {
|
---|
350 | $trailing_bytes = 4;
|
---|
351 | }
|
---|
352 | else if ($utf_char[0] > "\xFD")
|
---|
353 | {
|
---|
354 | $trailing_bytes = 0;
|
---|
355 | }
|
---|
356 | else
|
---|
357 | {
|
---|
358 | $trailing_bytes = 5;
|
---|
359 | }
|
---|
360 | }
|
---|
361 | else
|
---|
362 | {
|
---|
363 | $trailing_bytes = $utf_len - 1;
|
---|
364 | }
|
---|
365 |
|
---|
366 | $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
|
---|
367 | $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
|
---|
368 | $tmp_pos = $pos;
|
---|
369 |
|
---|
370 | continue;
|
---|
371 | }
|
---|
372 |
|
---|
373 | if (isset($extra_check[$c]))
|
---|
374 | {
|
---|
375 | switch ($c)
|
---|
376 | {
|
---|
377 | // Note: 0xED is quite common in Korean
|
---|
378 | case "\xED":
|
---|
379 | if ($utf_char >= "\xED\xA0\x80")
|
---|
380 | {
|
---|
381 | // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
|
---|
382 | $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
|
---|
383 | $pos += $utf_len;
|
---|
384 | $tmp_pos = $pos;
|
---|
385 | continue 2;
|
---|
386 | }
|
---|
387 | break;
|
---|
388 |
|
---|
389 | // Note: 0xEF is quite common in Japanese
|
---|
390 | case "\xEF":
|
---|
391 | if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
|
---|
392 | {
|
---|
393 | // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
|
---|
394 | $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
|
---|
395 | $pos += $utf_len;
|
---|
396 | $tmp_pos = $pos;
|
---|
397 | continue 2;
|
---|
398 | }
|
---|
399 | break;
|
---|
400 |
|
---|
401 | case "\xC0":
|
---|
402 | case "\xC1":
|
---|
403 | if ($utf_char <= "\xC1\xBF")
|
---|
404 | {
|
---|
405 | // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
|
---|
406 | $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
|
---|
407 | $pos += $utf_len;
|
---|
408 | $tmp_pos = $pos;
|
---|
409 | continue 2;
|
---|
410 | }
|
---|
411 | break;
|
---|
412 |
|
---|
413 | case "\xE0":
|
---|
414 | if ($utf_char <= "\xE0\x9F\xBF")
|
---|
415 | {
|
---|
416 | // Unicode char U+0000..U+07FF encoded in 3 bytes
|
---|
417 | $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
|
---|
418 | $pos += $utf_len;
|
---|
419 | $tmp_pos = $pos;
|
---|
420 | continue 2;
|
---|
421 | }
|
---|
422 | break;
|
---|
423 |
|
---|
424 | case "\xF0":
|
---|
425 | if ($utf_char <= "\xF0\x8F\xBF\xBF")
|
---|
426 | {
|
---|
427 | // Unicode char U+0000..U+FFFF encoded in 4 bytes
|
---|
428 | $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
|
---|
429 | $pos += $utf_len;
|
---|
430 | $tmp_pos = $pos;
|
---|
431 | continue 2;
|
---|
432 | }
|
---|
433 | break;
|
---|
434 |
|
---|
435 | default:
|
---|
436 | // Five- and six- byte sequences do not need being checked for here anymore
|
---|
437 | if ($utf_char > UTF8_MAX)
|
---|
438 | {
|
---|
439 | // Out of the Unicode range
|
---|
440 | if ($utf_char[0] < "\xF8")
|
---|
441 | {
|
---|
442 | $trailing_bytes = 3;
|
---|
443 | }
|
---|
444 | else if ($utf_char[0] < "\xFC")
|
---|
445 | {
|
---|
446 | $trailing_bytes = 4;
|
---|
447 | }
|
---|
448 | else if ($utf_char[0] > "\xFD")
|
---|
449 | {
|
---|
450 | $trailing_bytes = 0;
|
---|
451 | }
|
---|
452 | else
|
---|
453 | {
|
---|
454 | $trailing_bytes = 5;
|
---|
455 | }
|
---|
456 |
|
---|
457 | $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
|
---|
458 | $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
|
---|
459 | $tmp_pos = $pos;
|
---|
460 | continue 2;
|
---|
461 | }
|
---|
462 | break;
|
---|
463 | }
|
---|
464 | }
|
---|
465 |
|
---|
466 | // The char is a valid starter, move the cursor and go on
|
---|
467 | $pos += $utf_len;
|
---|
468 | continue;
|
---|
469 | }
|
---|
470 | }
|
---|
471 | else
|
---|
472 | {
|
---|
473 | // A trailing byte came out of nowhere, we will advance the cursor and treat the this byte and all following trailing bytes as if
|
---|
474 | // each of them was a Unicode replacement char
|
---|
475 | $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
|
---|
476 | $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
|
---|
477 |
|
---|
478 | $pos += $spn;
|
---|
479 | $tmp_pos = $pos;
|
---|
480 | continue;
|
---|
481 | }
|
---|
482 |
|
---|
483 |
|
---|
484 | // STEP 1: Decompose current char
|
---|
485 |
|
---|
486 | // We have found a character that is either:
|
---|
487 | // - in the NFC_QC/NFKC_QC list
|
---|
488 | // - a non-starter char that is not canonically ordered
|
---|
489 | //
|
---|
490 | // We are going to capture the shortest UTF sequence that satisfies these two conditions:
|
---|
491 | //
|
---|
492 | // 1 - If the sequence does not start at the begginning of the string, it must begin with a starter,
|
---|
493 | // and that starter must not have the NF[K]C_QC property equal to "MAYBE"
|
---|
494 | //
|
---|
495 | // 2 - If the sequence does not end at the end of the string, it must end with a non-starter and be
|
---|
496 | // immediately followed by a starter that is not on the QC list
|
---|
497 | //
|
---|
498 | $utf_seq = array();
|
---|
499 | $last_cc = 0;
|
---|
500 | $lpos = $pos;
|
---|
501 | $pos += $utf_len;
|
---|
502 |
|
---|
503 | if (isset($decomp_map[$utf_char]))
|
---|
504 | {
|
---|
505 | $_pos = 0;
|
---|
506 | $_len = strlen($decomp_map[$utf_char]);
|
---|
507 |
|
---|
508 | do
|
---|
509 | {
|
---|
510 | $_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];
|
---|
511 |
|
---|
512 | if (isset($_utf_len))
|
---|
513 | {
|
---|
514 | $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
|
---|
515 | $_pos += $_utf_len;
|
---|
516 | }
|
---|
517 | else
|
---|
518 | {
|
---|
519 | $utf_seq[] = $decomp_map[$utf_char][$_pos];
|
---|
520 | ++$_pos;
|
---|
521 | }
|
---|
522 | }
|
---|
523 | while ($_pos < $_len);
|
---|
524 | }
|
---|
525 | else
|
---|
526 | {
|
---|
527 | // The char is not decomposable
|
---|
528 | $utf_seq = array($utf_char);
|
---|
529 | }
|
---|
530 |
|
---|
531 |
|
---|
532 | // STEP 2: Capture the starter
|
---|
533 |
|
---|
534 | // Check out the combining class of the first character of the UTF sequence
|
---|
535 | $k = 0;
|
---|
536 | if (isset($utf_combining_class[$utf_seq[0]]) || $qc[$utf_char] == UNICODE_QC_MAYBE)
|
---|
537 | {
|
---|
538 | // Not a starter, inspect previous characters
|
---|
539 | // The last 8 characters are kept in a buffer so that we don't have to capture them everytime.
|
---|
540 | // This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,
|
---|
541 | // although it is slower than this method.
|
---|
542 | //
|
---|
543 | // In the following loop, $j starts at the previous buffered character ($i - 1, because current character is
|
---|
544 | // at offset $i) and process them in backward mode until we find a starter.
|
---|
545 | //
|
---|
546 | // $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more
|
---|
547 | // characters numbered 0 to n. $k starts at 0 and for each char we prepend we pre-decrement it and for numbering
|
---|
548 | $starter_found = 0;
|
---|
549 | $j_min = max(1, $i - 7);
|
---|
550 |
|
---|
551 | for ($j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j)
|
---|
552 | {
|
---|
553 | $utf_char = $buffer[$j & 7];
|
---|
554 | $lpos -= strlen($utf_char);
|
---|
555 |
|
---|
556 | if (isset($decomp_map[$utf_char]))
|
---|
557 | {
|
---|
558 | // The char is a composite, decompose for storage
|
---|
559 | $decomp_seq = array();
|
---|
560 | $_pos = 0;
|
---|
561 | $_len = strlen($decomp_map[$utf_char]);
|
---|
562 |
|
---|
563 | do
|
---|
564 | {
|
---|
565 | $c = $decomp_map[$utf_char][$_pos];
|
---|
566 | $_utf_len =& $utf_len_mask[$c & "\xF0"];
|
---|
567 |
|
---|
568 | if (isset($_utf_len))
|
---|
569 | {
|
---|
570 | $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
|
---|
571 | $_pos += $_utf_len;
|
---|
572 | }
|
---|
573 | else
|
---|
574 | {
|
---|
575 | $decomp_seq[] = $c;
|
---|
576 | ++$_pos;
|
---|
577 | }
|
---|
578 | }
|
---|
579 | while ($_pos < $_len);
|
---|
580 |
|
---|
581 | // Prepend the UTF sequence with our decomposed sequence
|
---|
582 | if (isset($decomp_seq[1]))
|
---|
583 | {
|
---|
584 | // The char expanded into several chars
|
---|
585 | $decomp_cnt = sizeof($decomp_seq);
|
---|
586 |
|
---|
587 | foreach ($decomp_seq as $decomp_i => $decomp_char)
|
---|
588 | {
|
---|
589 | $utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;
|
---|
590 | }
|
---|
591 | $k -= $decomp_cnt;
|
---|
592 | }
|
---|
593 | else
|
---|
594 | {
|
---|
595 | // Decomposed to a single char, easier to prepend
|
---|
596 | $utf_seq[--$k] = $decomp_seq[0];
|
---|
597 | }
|
---|
598 | }
|
---|
599 | else
|
---|
600 | {
|
---|
601 | $utf_seq[--$k] = $utf_char;
|
---|
602 | }
|
---|
603 |
|
---|
604 | if (!isset($utf_combining_class[$utf_seq[$k]]))
|
---|
605 | {
|
---|
606 | // We have found our starter
|
---|
607 | $starter_found = 1;
|
---|
608 | break;
|
---|
609 | }
|
---|
610 | }
|
---|
611 |
|
---|
612 | if (!$starter_found && $lpos > $tmp_pos)
|
---|
613 | {
|
---|
614 | // The starter was not found in the buffer, let's rewind some more
|
---|
615 | do
|
---|
616 | {
|
---|
617 | // $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_en > 0 then it's a leading byte, otherwise it's a trailing byte.
|
---|
618 | $c = $str[--$lpos];
|
---|
619 | $c_mask = $c & "\xF0";
|
---|
620 |
|
---|
621 | if (isset($utf_len_mask[$c_mask]))
|
---|
622 | {
|
---|
623 | // UTF byte
|
---|
624 | if ($utf_len = $utf_len_mask[$c_mask])
|
---|
625 | {
|
---|
626 | // UTF *leading* byte
|
---|
627 | $utf_char = substr($str, $lpos, $utf_len);
|
---|
628 |
|
---|
629 | if (isset($decomp_map[$utf_char]))
|
---|
630 | {
|
---|
631 | // Decompose the character
|
---|
632 | $decomp_seq = array();
|
---|
633 | $_pos = 0;
|
---|
634 | $_len = strlen($decomp_map[$utf_char]);
|
---|
635 |
|
---|
636 | do
|
---|
637 | {
|
---|
638 | $c = $decomp_map[$utf_char][$_pos];
|
---|
639 | $_utf_len =& $utf_len_mask[$c & "\xF0"];
|
---|
640 |
|
---|
641 | if (isset($_utf_len))
|
---|
642 | {
|
---|
643 | $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
|
---|
644 | $_pos += $_utf_len;
|
---|
645 | }
|
---|
646 | else
|
---|
647 | {
|
---|
648 | $decomp_seq[] = $c;
|
---|
649 | ++$_pos;
|
---|
650 | }
|
---|
651 | }
|
---|
652 | while ($_pos < $_len);
|
---|
653 |
|
---|
654 | // Prepend the UTF sequence with our decomposed sequence
|
---|
655 | if (isset($decomp_seq[1]))
|
---|
656 | {
|
---|
657 | // The char expanded into several chars
|
---|
658 | $decomp_cnt = sizeof($decomp_seq);
|
---|
659 | foreach ($decomp_seq as $decomp_i => $utf_char)
|
---|
660 | {
|
---|
661 | $utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;
|
---|
662 | }
|
---|
663 | $k -= $decomp_cnt;
|
---|
664 | }
|
---|
665 | else
|
---|
666 | {
|
---|
667 | // Decomposed to a single char, easier to prepend
|
---|
668 | $utf_seq[--$k] = $decomp_seq[0];
|
---|
669 | }
|
---|
670 | }
|
---|
671 | else
|
---|
672 | {
|
---|
673 | $utf_seq[--$k] = $utf_char;
|
---|
674 | }
|
---|
675 | }
|
---|
676 | }
|
---|
677 | else
|
---|
678 | {
|
---|
679 | // ASCII char
|
---|
680 | $utf_seq[--$k] = $c;
|
---|
681 | }
|
---|
682 | }
|
---|
683 | while ($lpos > $tmp_pos);
|
---|
684 | }
|
---|
685 | }
|
---|
686 |
|
---|
687 |
|
---|
688 | // STEP 3: Capture following combining modifiers
|
---|
689 |
|
---|
690 | while ($pos < $len)
|
---|
691 | {
|
---|
692 | $c_mask = $str[$pos] & "\xF0";
|
---|
693 |
|
---|
694 | if (isset($utf_len_mask[$c_mask]))
|
---|
695 | {
|
---|
696 | if ($utf_len = $utf_len_mask[$c_mask])
|
---|
697 | {
|
---|
698 | $utf_char = substr($str, $pos, $utf_len);
|
---|
699 | }
|
---|
700 | else
|
---|
701 | {
|
---|
702 | // A trailing byte came out of nowhere
|
---|
703 | // Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop
|
---|
704 | // as if it was a starter (replacement chars ARE starters) and let the next loop replace it
|
---|
705 | break;
|
---|
706 | }
|
---|
707 |
|
---|
708 | if (isset($utf_combining_class[$utf_char]) || isset($qc[$utf_char]))
|
---|
709 | {
|
---|
710 | // Combining character, add it to the sequence and move the cursor
|
---|
711 | if (isset($decomp_map[$utf_char]))
|
---|
712 | {
|
---|
713 | // Decompose the character
|
---|
714 | $_pos = 0;
|
---|
715 | $_len = strlen($decomp_map[$utf_char]);
|
---|
716 |
|
---|
717 | do
|
---|
718 | {
|
---|
719 | $c = $decomp_map[$utf_char][$_pos];
|
---|
720 | $_utf_len =& $utf_len_mask[$c & "\xF0"];
|
---|
721 |
|
---|
722 | if (isset($_utf_len))
|
---|
723 | {
|
---|
724 | $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
|
---|
725 | $_pos += $_utf_len;
|
---|
726 | }
|
---|
727 | else
|
---|
728 | {
|
---|
729 | $utf_seq[] = $c;
|
---|
730 | ++$_pos;
|
---|
731 | }
|
---|
732 | }
|
---|
733 | while ($_pos < $_len);
|
---|
734 | }
|
---|
735 | else
|
---|
736 | {
|
---|
737 | $utf_seq[] = $utf_char;
|
---|
738 | }
|
---|
739 |
|
---|
740 | $pos += $utf_len;
|
---|
741 | }
|
---|
742 | else
|
---|
743 | {
|
---|
744 | // Combining class 0 and no QC, break out of the loop
|
---|
745 | // Note: we do not know if that character is valid. If it's not, the next iteration will replace it
|
---|
746 | break;
|
---|
747 | }
|
---|
748 | }
|
---|
749 | else
|
---|
750 | {
|
---|
751 | // ASCII chars are starters
|
---|
752 | break;
|
---|
753 | }
|
---|
754 | }
|
---|
755 |
|
---|
756 |
|
---|
757 | // STEP 4: Sort and combine
|
---|
758 |
|
---|
759 | // Here we sort...
|
---|
760 | $k_max = $k + sizeof($utf_seq);
|
---|
761 |
|
---|
762 | if (!$k && $k_max == 1)
|
---|
763 | {
|
---|
764 | // There is only one char in the UTF sequence, add it then jump to the next iteration of main loop
|
---|
765 | // Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases
|
---|
766 | // if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))
|
---|
767 | // {
|
---|
768 | $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $utf_seq[0];
|
---|
769 | $tmp_pos = $pos;
|
---|
770 | // }
|
---|
771 |
|
---|
772 | continue;
|
---|
773 | }
|
---|
774 |
|
---|
775 | // ...there we combine
|
---|
776 | if (isset($utf_combining_class[$utf_seq[$k]]))
|
---|
777 | {
|
---|
778 | $starter = $nf_seq = '';
|
---|
779 | }
|
---|
780 | else
|
---|
781 | {
|
---|
782 | $starter = $utf_seq[$k++];
|
---|
783 | $nf_seq = '';
|
---|
784 | }
|
---|
785 | $utf_sort = array();
|
---|
786 |
|
---|
787 | // We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine
|
---|
788 | // at the end of the string without altering it
|
---|
789 | $utf_seq[] = '';
|
---|
790 |
|
---|
791 | do
|
---|
792 | {
|
---|
793 | $utf_char = $utf_seq[$k++];
|
---|
794 |
|
---|
795 | if (isset($utf_combining_class[$utf_char]))
|
---|
796 | {
|
---|
797 | $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
|
---|
798 | }
|
---|
799 | else
|
---|
800 | {
|
---|
801 | if (empty($utf_sort))
|
---|
802 | {
|
---|
803 | // No combining characters... check for a composite of the two starters
|
---|
804 | if (isset($utf_canonical_comp[$starter . $utf_char]))
|
---|
805 | {
|
---|
806 | // Good ol' composite character
|
---|
807 | $starter = $utf_canonical_comp[$starter . $utf_char];
|
---|
808 | }
|
---|
809 | else if (isset($utf_jamo_type[$utf_char]))
|
---|
810 | {
|
---|
811 | // Current char is a composable jamo
|
---|
812 | if (isset($utf_jamo_type[$starter]) && $utf_jamo_type[$starter] == UNICODE_JAMO_L && $utf_jamo_type[$utf_char] == UNICODE_JAMO_V)
|
---|
813 | {
|
---|
814 | // We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo
|
---|
815 | if (isset($utf_jamo_type[$utf_seq[$k]]) && $utf_jamo_type[$utf_seq[$k]] == UNICODE_JAMO_T)
|
---|
816 | {
|
---|
817 | // L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)
|
---|
818 | $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char] + $utf_jamo_index[$utf_seq[$k]];
|
---|
819 | ++$k;
|
---|
820 | }
|
---|
821 | else
|
---|
822 | {
|
---|
823 | // L+V jamos, combine to a LV Hangul syllable
|
---|
824 | $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char];
|
---|
825 | }
|
---|
826 |
|
---|
827 | $starter = chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
|
---|
828 | }
|
---|
829 | else
|
---|
830 | {
|
---|
831 | // Non-composable jamo, just add it to the sequence
|
---|
832 | $nf_seq .= $starter;
|
---|
833 | $starter = $utf_char;
|
---|
834 | }
|
---|
835 | }
|
---|
836 | else
|
---|
837 | {
|
---|
838 | // No composite, just add the first starter to the sequence then continue with the other one
|
---|
839 | $nf_seq .= $starter;
|
---|
840 | $starter = $utf_char;
|
---|
841 | }
|
---|
842 | }
|
---|
843 | else
|
---|
844 | {
|
---|
845 | ksort($utf_sort);
|
---|
846 |
|
---|
847 | // For each class of combining characters
|
---|
848 | foreach ($utf_sort as $cc => $utf_chars)
|
---|
849 | {
|
---|
850 | $j = 0;
|
---|
851 |
|
---|
852 | do
|
---|
853 | {
|
---|
854 | // Look for a composite
|
---|
855 | if (isset($utf_canonical_comp[$starter . $utf_chars[$j]]))
|
---|
856 | {
|
---|
857 | // Found a composite, replace the starter
|
---|
858 | $starter = $utf_canonical_comp[$starter . $utf_chars[$j]];
|
---|
859 | unset($utf_sort[$cc][$j]);
|
---|
860 | }
|
---|
861 | else
|
---|
862 | {
|
---|
863 | // No composite, all following characters in that class are blocked
|
---|
864 | break;
|
---|
865 | }
|
---|
866 | }
|
---|
867 | while (isset($utf_sort[$cc][++$j]));
|
---|
868 | }
|
---|
869 |
|
---|
870 | // Add the starter to the normalized sequence, followed by non-starters in canonical order
|
---|
871 | $nf_seq .= $starter;
|
---|
872 |
|
---|
873 | foreach ($utf_sort as $utf_chars)
|
---|
874 | {
|
---|
875 | if (!empty($utf_chars))
|
---|
876 | {
|
---|
877 | $nf_seq .= implode('', $utf_chars);
|
---|
878 | }
|
---|
879 | }
|
---|
880 |
|
---|
881 | // Reset the array and go on
|
---|
882 | $utf_sort = array();
|
---|
883 | $starter = $utf_char;
|
---|
884 | }
|
---|
885 | }
|
---|
886 | }
|
---|
887 | while ($k <= $k_max);
|
---|
888 |
|
---|
889 | $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $nf_seq;
|
---|
890 | $tmp_pos = $pos;
|
---|
891 | }
|
---|
892 | else
|
---|
893 | {
|
---|
894 | // Only an ASCII char can make the program get here
|
---|
895 | //
|
---|
896 | // First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().
|
---|
897 | //
|
---|
898 | // The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on
|
---|
899 | // multi-byte text (where the only ASCII chars are spaces and punctuation)
|
---|
900 | if (++$pos != $len)
|
---|
901 | {
|
---|
902 | if ($str[$pos] < "\x80")
|
---|
903 | {
|
---|
904 | $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
|
---|
905 | $buffer[++$i & 7] = $str[$pos - 1];
|
---|
906 | }
|
---|
907 | else
|
---|
908 | {
|
---|
909 | $buffer[++$i & 7] = $c;
|
---|
910 | }
|
---|
911 | }
|
---|
912 | }
|
---|
913 | }
|
---|
914 | while ($pos < $len);
|
---|
915 |
|
---|
916 | // Now is time to return the string
|
---|
917 | if ($tmp_pos)
|
---|
918 | {
|
---|
919 | // If the $tmp_pos cursor is not at the beginning of the string then at least one character was not in normal form. Replace $str with the fixed version
|
---|
920 | if ($tmp_pos == $len)
|
---|
921 | {
|
---|
922 | // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
|
---|
923 | return $tmp;
|
---|
924 | }
|
---|
925 | else
|
---|
926 | {
|
---|
927 | // The rightmost chunk of $str has not been appended to $tmp yet
|
---|
928 | return $tmp . substr($str, $tmp_pos);
|
---|
929 | }
|
---|
930 | }
|
---|
931 |
|
---|
932 | // The string was already in normal form
|
---|
933 | return $str;
|
---|
934 | }
|
---|
935 |
|
---|
936 | /**
|
---|
937 | * Decompose a UTF string
|
---|
938 | *
|
---|
939 | * @param string $str UTF string
|
---|
940 | * @param integer $pos Position of the first UTF char (in bytes)
|
---|
941 | * @param integer $len Length of the string (in bytes)
|
---|
942 | * @param array &$decomp_map Decomposition mapping, passed by reference but never modified
|
---|
943 | * @return string The string, decomposed and sorted canonically
|
---|
944 | *
|
---|
945 | * @access private
|
---|
946 | */
|
---|
function decompose($str, $pos, $len, &$decomp_map)
{
	// Decomposes $str (starting at byte offset $pos, $len bytes long) using $decomp_map, puts
	// combining characters in canonical order and replaces every ill-formed or illegal UTF-8
	// sequence with UTF8_REPLACEMENT. Returns $str itself when nothing had to be changed.
	//
	// Bookkeeping used throughout the scan:
	// - $tmp         output built so far; it covers the input up to $tmp_pos
	// - $tmp_pos     offset in $str of the first byte that has not been copied to $tmp yet
	// - $starter_pos offset right after the last starter seen, i.e. where the current run of
	//                combining characters begins
	// - $utf_sort    combining characters seen since $starter_pos, grouped by combining class
	// - $last_cc     highest combining class seen in the current run
	// - $sort        set when the run is not in canonical order and needs a ksort()
	// - $dump        set when the pending run cannot be copied verbatim from $str (it was
	//                reordered, or it contains decomposed characters)
	//
	// "Flushing" always means: copy $str[$tmp_pos .. $starter_pos) and then the buffered combining
	// characters. When $dump is not set this produces exactly the bytes found in $str, so one
	// code path is valid in every situation and no negative substr() length can occur.
	global $utf_combining_class;

	// Load some commonly-used tables
	if (!isset($utf_combining_class))
	{
		global $phpbb_root_path, $phpEx;
		include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
	}

	// Nothing to scan. Without this guard the loop below would read $str[$pos] out of range
	if ($pos >= $len)
	{
		return $str;
	}

	// UTF char length array, indexed by the high nibble of a byte
	$utf_len_mask = array(
		// Leading bytes masks
		"\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
		// Trailing bytes masks
		"\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
	);

	// Some extra checks are triggered on the first byte of a UTF sequence
	$extra_check = array(
		"\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
		"\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
		"\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
	);

	// These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge:
	//   - 2-byte: 110? ???? 10?? ????
	//   - 3-byte: 1110 ???? 10?? ???? 10?? ????
	//   - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????
	// Note that 5- and 6- byte sequences are automatically discarded
	$utf_validation_mask = array(
		2 => "\xE0\xC0",
		3 => "\xF0\xC0\xC0",
		4 => "\xF8\xC0\xC0\xC0"
	);

	$utf_validation_check = array(
		2 => "\xC0\x80",
		3 => "\xE0\x80\x80",
		4 => "\xF0\x80\x80\x80"
	);

	$tmp = '';
	$starter_pos = $pos;
	$tmp_pos = $last_cc = $sort = $dump = 0;
	$utf_sort = array();

	// Main loop
	while ($pos < $len)
	{
		$cur_mask = $str[$pos] & "\xF0";

		if (!isset($utf_len_mask[$cur_mask]))
		{
			// ASCII char, which happens to be a starter (as any other ASCII char)
			if ($dump)
			{
				$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

				// Dump combiners
				if (!empty($utf_sort))
				{
					if ($sort)
					{
						ksort($utf_sort);
					}

					foreach ($utf_sort as $utf_chars)
					{
						$tmp .= implode('', $utf_chars);
					}
				}

				$tmp .= $str[$pos];
				$dump = $sort = 0;
				$tmp_pos = $pos + 1;
			}

			// Skip the current byte, then quickly skip the ASCII chars that follow it
			++$pos;
			$pos += strspn($str, UTF8_ASCII_RANGE, $pos);

			$last_cc = 0;
			$utf_sort = array();
			$starter_pos = $pos;

			continue;
		}

		$utf_len = $utf_len_mask[$cur_mask];

		if (!$utf_len)
		{
			// A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode
			// replacement char and we will advance the cursor
			$spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
			$pos += $spn;
			$emit = str_repeat(UTF8_REPLACEMENT, $spn);
		}
		else
		{
			// Multibyte char (may be shorter than $utf_len if the string is truncated)
			$utf_char = substr($str, $pos, $utf_len);
			$pos += $utf_len;

			if (isset($decomp_map[$utf_char]))
			{
				// Decompose the char, one UTF char of the mapping at a time
				$decomp = $decomp_map[$utf_char];
				$_pos = 0;
				$_len = strlen($decomp);

				do
				{
					$c = $decomp[$_pos];
					$_mask = $c & "\xF0";

					// Plain lookup: a reference assignment here would add null entries to $utf_len_mask
					if (!empty($utf_len_mask[$_mask]))
					{
						$_utf_len = $utf_len_mask[$_mask];
						$_utf_char = substr($decomp, $_pos, $_utf_len);
					}
					else
					{
						// ASCII char; always advancing by one byte also keeps this loop finite
						$_utf_len = 1;
						$_utf_char = $c;
					}

					$_pos += $_utf_len;

					if (isset($utf_combining_class[$_utf_char]))
					{
						// The character decomposed to a non-starter, buffer it for sorting
						$cc = $utf_combining_class[$_utf_char];
						$utf_sort[$cc][] = $_utf_char;
						$dump = 1;

						if ($cc < $last_cc)
						{
							// Not canonically ordered, will require sorting
							$sort = 1;
						}
						else
						{
							$last_cc = $cc;
						}
					}
					else
					{
						// This character decomposition contains a starter, dump the buffer and continue.
						// The buffered combiners are always written here, whether or not $dump is set:
						// copying only up to $starter_pos would drop in-order combiners that precede the
						// decomposed char, and measuring from $pos - $utf_len would give a negative
						// length for the second and later starters of the same decomposition.
						$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

						if (!empty($utf_sort))
						{
							if ($sort)
							{
								ksort($utf_sort);
							}

							foreach ($utf_sort as $utf_chars)
							{
								$tmp .= implode('', $utf_chars);
							}
						}

						$tmp .= $_utf_char;
						$dump = $sort = 0;

						$tmp_pos = $starter_pos = $pos;
						$utf_sort = array();
						$last_cc = 0;
					}
				}
				while ($_pos < $_len);

				continue;
			}

			if (isset($utf_combining_class[$utf_char]))
			{
				// Combining character
				$cc = $utf_combining_class[$utf_char];

				if ($cc < $last_cc)
				{
					// Not in canonical order
					$sort = $dump = 1;
				}
				else
				{
					$last_cc = $cc;
				}

				$utf_sort[$cc][] = $utf_char;

				continue;
			}

			// Non-decomposable starter. $emit stays null as long as the char can be kept as is
			$emit = null;

			if (($utf_char & $utf_validation_mask[$utf_len]) !== $utf_validation_check[$utf_len])
			{
				// Current char isn't well-formed: either one or several trailing bytes are missing, or the Unicode char
				// has been encoded in a five- or six- byte sequence. This check runs before the Hangul test so that
				// garbage whose bytes merely sort inside the Hangul range is never decoded as a syllable.
				//
				// Move the cursor right after the leading byte, then skip the trailing bytes that follow it. A replacement
				// char is added for the leading byte, then another one for every trailing byte.
				$pos -= $utf_len - 1;
				$spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
				$pos += $spn;
				$emit = str_repeat(UTF8_REPLACEMENT, $spn + 1);
			}
			else if ($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST)
			{
				// Hangul syllable
				$idx = (((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE;

				// LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).
				//
				// The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte
				if ($t_index = $idx % UNICODE_HANGUL_TCOUNT)
				{
					if ($t_index < 25)
					{
						$emit = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";
						$emit[8] = chr(0xA7 + $t_index);
					}
					else
					{
						$emit = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";
						$emit[8] = chr(0x67 + $t_index);
					}
				}
				else
				{
					$emit = "\xE1\x84\x00\xE1\x85\x00";
				}

				$emit[2] = chr(0x80 + (int) ($idx / UNICODE_HANGUL_NCOUNT));
				$emit[5] = chr(0xA1 + (int) (($idx % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT));
			}
			else if (isset($extra_check[$utf_char[0]]))
			{
				switch ($utf_char[0])
				{
					// Note: 0xED is quite common in Korean
					case "\xED":
						// Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
						$illegal = ($utf_char >= "\xED\xA0\x80");
					break;

					// Note: 0xEF is quite common in Japanese
					case "\xEF":
						// U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
						$illegal = ($utf_char === "\xEF\xBF\xBE" || $utf_char === "\xEF\xBF\xBF");
					break;

					case "\xC0":
					case "\xC1":
						// Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
						$illegal = ($utf_char <= "\xC1\xBF");
					break;

					case "\xE0":
						// Unicode char U+0000..U+07FF encoded in 3 bytes
						$illegal = ($utf_char <= "\xE0\x9F\xBF");
					break;

					case "\xF0":
						// Unicode char U+0000..U+FFFF encoded in 4 bytes
						$illegal = ($utf_char <= "\xF0\x8F\xBF\xBF");
					break;

					default:
						// Out of the Unicode range
						$illegal = ($utf_char > UTF8_MAX);
					break;
				}

				if ($illegal)
				{
					$emit = UTF8_REPLACEMENT;
				}
			}

			if ($emit === null)
			{
				if (!$dump)
				{
					// Regular starter and nothing pending: leave it in $str, it will be copied verbatim later
					$last_cc = 0;
					$utf_sort = array();
					$starter_pos = $pos;

					continue;
				}

				// Regular starter, but what precedes it has to go through $tmp, and so does the starter itself
				$emit = $utf_char;
			}
		}

		// Write what is pending, then the replacement text for the current char
		$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

		// Dump combiners
		if (!empty($utf_sort))
		{
			if ($sort)
			{
				ksort($utf_sort);
			}

			foreach ($utf_sort as $utf_chars)
			{
				$tmp .= implode('', $utf_chars);
			}
		}

		$tmp .= $emit;
		$dump = $sort = 0;

		// Every piece of state is reset here, including $starter_pos and $last_cc after an ill-formed
		// sequence: a stale $starter_pos would end up behind $tmp_pos and corrupt the next flush
		$tmp_pos = $starter_pos = $pos;
		$utf_sort = array();
		$last_cc = 0;
	}

	// Now is time to return the string
	if ($dump)
	{
		$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

		// Dump combiners
		if (!empty($utf_sort))
		{
			if ($sort)
			{
				ksort($utf_sort);
			}

			foreach ($utf_sort as $utf_chars)
			{
				$tmp .= implode('', $utf_chars);
			}
		}

		return $tmp;
	}

	if ($tmp_pos)
	{
		// If the $tmp_pos cursor was moved then at least one character was not in normal form. When the cursor is
		// at the end of $str, $tmp holds the whole result; otherwise the rightmost chunk of $str is still missing
		return ($tmp_pos == $len) ? $tmp : $tmp . substr($str, $tmp_pos);
	}

	// The string was already in normal form
	return $str;
}
|
---|
1514 | }
|
---|
1515 |
|
---|
1516 | ?>
|
---|