Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

source: branches/posledni/forum/includes/utf/utf_normalizer.php

Last change on this file was 400, checked in by george, 17 years ago
Přidáno: Nové forum phpBB 3.
File size: 41.8 KB

Line
1	<?php
2	/**
3	*
4	* @package utf
5	* @version $Id: utf_normalizer.php 8479 2008-03-29 00:22:48Z naderman $
6	* @copyright (c) 2005 phpBB Group
7	* @license http://opensource.org/licenses/gpl-license.php GNU Public License
8	*
9	*/
10
/**
* Direct-access guard
*
* Stop immediately unless the including script has defined the IN_PHPBB constant.
*/
if (defined('IN_PHPBB') === false)
{
	exit;
}
17
/**
* A handful of Unicode code points, spelled out as UTF-8 byte sequences
*
* Kept around for compatibility
*/
define('UTF8_REPLACEMENT',		"\xEF\xBF\xBD");		// U+FFFD
define('UTF8_MAX',				"\xF4\x8F\xBF\xBF");	// U+10FFFF
define('UTF8_FFFE',				"\xEF\xBF\xBE");		// U+FFFE
define('UTF8_FFFF',				"\xEF\xBF\xBF");		// U+FFFF
define('UTF8_SURROGATE_FIRST',	"\xED\xA0\x80");		// U+D800
define('UTF8_SURROGATE_LAST',	"\xED\xBF\xBF");		// U+DFFF
define('UTF8_HANGUL_FIRST',		"\xEA\xB0\x80");		// U+AC00
define('UTF8_HANGUL_LAST',		"\xED\x9E\xA3");		// U+D7A3

define('UTF8_CJK_FIRST',		"\xE4\xB8\x80");		// U+4E00
define('UTF8_CJK_LAST',			"\xE9\xBE\xBB");		// U+9FBB
define('UTF8_CJK_B_FIRST',		"\xF0\xA0\x80\x80");	// U+20000
define('UTF8_CJK_B_LAST',		"\xF0\xAA\x9B\x96");	// U+2A6D6

// Make sure the lookup tables start out undefined; utf_normalizer loads each of them on demand
unset(
	$GLOBALS['utf_jamo_index'],
	$GLOBALS['utf_jamo_type'],
	$GLOBALS['utf_nfc_qc'],
	$GLOBALS['utf_combining_class'],
	$GLOBALS['utf_canonical_comp'],
	$GLOBALS['utf_canonical_decomp'],
	$GLOBALS['utf_nfkc_qc'],
	$GLOBALS['utf_compatibility_decomp']
);

// Values stored in the NFC_QC and NFKC_QC quick-check tables
define('UNICODE_QC_MAYBE',	0);
define('UNICODE_QC_NO',		1);

// Every ASCII byte (0x00..0x7F), ordered by how frequently it shows up in UTF-8 text
define('UTF8_ASCII_RANGE', "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F");

// Every byte (0x80..0xBF) that may follow the leading byte of a multibyte UTF-8 char
define('UTF8_TRAILING_BYTES', "\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A");

// Parameters of the Hangul [de]composition algorithms
// Note: NCOUNT equals VCOUNT * TCOUNT and SCOUNT equals LCOUNT * NCOUNT
define('UNICODE_HANGUL_SBASE',	0xAC00);
define('UNICODE_HANGUL_LBASE',	0x1100);
define('UNICODE_HANGUL_VBASE',	0x1161);
define('UNICODE_HANGUL_TBASE',	0x11A7);
define('UNICODE_HANGUL_SCOUNT',	11172);
define('UNICODE_HANGUL_LCOUNT',	19);
define('UNICODE_HANGUL_VCOUNT',	21);
define('UNICODE_HANGUL_TCOUNT',	28);
define('UNICODE_HANGUL_NCOUNT',	588);

// Jamo kinds: leading consonant (L), vowel (V), trailing consonant (T)
define('UNICODE_JAMO_L',	0);
define('UNICODE_JAMO_V',	1);
define('UNICODE_JAMO_T',	2);
63
64	/**
65	* Unicode normalization routines
66	*
67	* @package utf
68	*/
69	class utf_normalizer
70	{
/**
* Sanitize a string and bring it to Normal Form C (canonical composition)
*
* Convenience entry point. Control characters other than tab, line feed and
* carriage return are first turned into 0xFF bytes, which are illegal in UTF-8,
* and the string is then validated and recomposed by recompose().
* A string made only of the authorized single-byte characters is left untouched.
*
* @param string &$str The dirty string, cleaned up in place
* @return void
*/
function cleanup(&$str)
{
	// Authorized single-byte characters, the most frequent ones in latin text first
	$authorized = "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D";

	$len = strlen($str);
	$pos = strspn($str, $authorized);

	if ($len == $pos)
	{
		// Nothing but authorized ASCII characters, there is nothing to clean up
		return;
	}

	// Load the NFC quick-check table on first use
	if (!isset($GLOBALS['utf_nfc_qc']))
	{
		global $phpbb_root_path, $phpEx;
		include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
	}

	// Load the canonical decomposition table on first use
	if (!isset($GLOBALS['utf_canonical_decomp']))
	{
		global $phpbb_root_path, $phpEx;
		include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
	}

	// Every byte in 0x00..0x1F but \t, \n and \r becomes 0xFF. That byte can never appear in
	// valid UTF-8, so recompose() will in turn swap it for the Unicode replacement char
	$control_bytes = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F";
	$illegal_bytes = "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF";
	$str = strtr($str, $control_bytes, $illegal_bytes);

	$str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
}
115
/**
* Validate a UTF string and normalize it to NFC (canonical composition)
*
* The string is rewritten in place. Pure ASCII strings are left as they are.
*
* @param string &$str Unchecked UTF string, replaced with its validated and normalized form
* @return void
*/
function nfc(&$str)
{
	$len = strlen($str);
	$pos = strspn($str, UTF8_ASCII_RANGE);

	// Only strings holding at least one non-ASCII byte need any work
	if ($len != $pos)
	{
		// Quick-check table for NFC, loaded on first use
		if (!isset($GLOBALS['utf_nfc_qc']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
		}

		// Canonical decomposition table, loaded on first use
		if (!isset($GLOBALS['utf_canonical_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
		}

		$str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
	}
}
147
/**
* Validate a UTF string and normalize it to NFKC (compatibility composition)
*
* The string is rewritten in place. Pure ASCII strings are left as they are.
*
* @param string &$str Unchecked UTF string, replaced with its validated and normalized form
* @return void
*/
function nfkc(&$str)
{
	$len = strlen($str);
	$pos = strspn($str, UTF8_ASCII_RANGE);

	// Only strings holding at least one non-ASCII byte need any work
	if ($len != $pos)
	{
		// Quick-check table for NFKC, loaded on first use
		if (!isset($GLOBALS['utf_nfkc_qc']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx);
		}

		// Compatibility decomposition table, loaded on first use
		if (!isset($GLOBALS['utf_compatibility_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
		}

		$str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
	}
}
179
/**
* Validate a UTF string and normalize it to NFD (canonical decomposition)
*
* The string is rewritten in place. Pure ASCII strings are left as they are.
*
* @param string &$str Unchecked UTF string, replaced with its validated and normalized form
* @return void
*/
function nfd(&$str)
{
	$len = strlen($str);
	$pos = strspn($str, UTF8_ASCII_RANGE);

	// Only strings holding at least one non-ASCII byte need any work
	if ($len != $pos)
	{
		// Canonical decomposition table, loaded on first use
		if (!isset($GLOBALS['utf_canonical_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
		}

		$str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);
	}
}
205
/**
* Validate a UTF string and normalize it to NFKD (compatibility decomposition)
*
* The string is rewritten in place. Pure ASCII strings are left as they are.
*
* @param string &$str Unchecked UTF string, replaced with its validated and normalized form
* @return void
*/
function nfkd(&$str)
{
	$len = strlen($str);
	$pos = strspn($str, UTF8_ASCII_RANGE);

	// Only strings holding at least one non-ASCII byte need any work
	if ($len != $pos)
	{
		// Compatibility decomposition table, loaded on first use
		if (!isset($GLOBALS['utf_compatibility_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
		}

		$str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);
	}
}
231
232
233	/**
234	* Validate a UTF string and recompose it
*
* Walks the string starting at $pos. Ill-formed, overlong, surrogate, U+FFFE/U+FFFF and out-of-range
* sequences, as well as stray trailing bytes, are replaced with UTF8_REPLACEMENT. Every run of characters
* that is either listed in $qc or holds combining characters out of canonical order is decomposed through
* $decomp_map, sorted by combining class and recomposed through the canonical composition table.
*
* The byte at $pos is read before any bounds check, so callers are expected to pass $pos < $len.
235	*
236	* @param string $str Unchecked UTF string
237	* @param integer $pos Position of the first UTF char (in bytes)
238	* @param integer $len Length of the string (in bytes)
239	* @param array &$qc Quick-check array, passed by reference but never modified
240	* @param array &$decomp_map Decomposition mapping, passed by reference but never modified
241	* @return string The string, validated and recomposed ($str itself when nothing had to be changed)
242	*
243	* @access private
244	*/
245	function recompose($str, $pos, $len, &$qc, &$decomp_map)
246	{
247	global $utf_combining_class, $utf_canonical_comp, $utf_jamo_type, $utf_jamo_index;
248
249	// Load some commonly-used tables
250	if (!isset($utf_jamo_index, $utf_jamo_type, $utf_combining_class))
251	{
252	global $phpbb_root_path, $phpEx;
253	include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
254	}
255
256	// Load the canonical composition table
257	if (!isset($utf_canonical_comp))
258	{
259	global $phpbb_root_path, $phpEx;
260	include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);
261	}
262
263	// Buffer the last ASCII char before the UTF-8 stuff if applicable
// $tmp accumulates the rewritten string, $tmp_pos is the position in $str up to which $tmp is complete
264	$tmp = '';
265	$i = $tmp_pos = $last_cc = 0;
266
267	$buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array();
268
269	// UTF char length array
270	// This array is used to determine the length of a UTF character.
271	// Be $c the result of ($str[$pos] & "\xF0") --where $str is the string we're operating on and $pos
272	// the position of the cursor--, if $utf_len_mask[$c] does not exist, the byte is an ASCII char.
273	// Otherwise, if $utf_len_mask[$c] is greater than 0, we have the leading byte of a multibyte character
274	// whose length is $utf_len_mask[$c] and if it is equal to 0, the byte is a trailing byte.
275	$utf_len_mask = array(
276	// Leading bytes masks
277	"\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
278	// Trailing bytes masks
279	"\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
280	);
281
// Leading bytes for which the additional range checks of the switch statement below are run
282	$extra_check = array(
283	"\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
284	"\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
285	"\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
286	);
287
// A char of length n is well-formed when ($utf_char & $utf_validation_mask[n]) equals $utf_validation_check[n]:
// - 2-byte: 110? ???? 10?? ????
// - 3-byte: 1110 ???? 10?? ???? 10?? ????
// - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????
288	$utf_validation_mask = array(
289	2 => "\xE0\xC0",
290	3 => "\xF0\xC0\xC0",
291	4 => "\xF8\xC0\xC0\xC0"
292	);
293
294	$utf_validation_check = array(
295	2 => "\xC0\x80",
296	3 => "\xE0\x80\x80",
297	4 => "\xF0\x80\x80\x80"
298	);
299
300	// Main loop
301	do
302	{
303	// STEP 0: Capture the current char and buffer it
304	$c = $str[$pos];
305	$c_mask = $c & "\xF0";
306
307	if (isset($utf_len_mask[$c_mask]))
308	{
309	// Byte at $pos is either a leading byte or a misplaced trailing byte
310	if ($utf_len = $utf_len_mask[$c_mask])
311	{
312	// Capture the char and store it in the 8-slot ring buffer that STEP 2 reads backwards
313	$buffer[++$i & 7] = $utf_char = substr($str, $pos, $utf_len);
314
315	// Let's find out if a thorough check is needed
316	if (isset($qc[$utf_char]))
317	{
318	// If the UTF char is in the qc array then it may not be in normal form. We do nothing here, the actual processing is below this "if" block
319	}
320	else if (isset($utf_combining_class[$utf_char]))
321	{
322	if ($utf_combining_class[$utf_char] < $last_cc)
323	{
324	// A combining character that is NOT canonically ordered
325	}
326	else
327	{
328	// A combining character that IS canonically ordered, skip to the next char
329	$last_cc = $utf_combining_class[$utf_char];
330
331	$pos += $utf_len;
332	continue;
333	}
334	}
335	else
336	{
337	// At this point, $utf_char holds a UTF char that we know is not a NF[K]C_QC and is not a combining character.
338	// It can be a singleton, a canonical composite, a replacement char or even an ill-formed bunch of bytes. Let's find out
339	$last_cc = 0;
340
341	// Check that we have the correct number of trailing bytes
342	if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
343	{
344	// Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
345	// has been encoded in a five- or six- byte sequence
346	if ($utf_char[0] >= "\xF8")
347	{
348	if ($utf_char[0] < "\xFC")
349	{
350	$trailing_bytes = 4;
351	}
352	else if ($utf_char[0] > "\xFD")
353	{
354	$trailing_bytes = 0;
355	}
356	else
357	{
358	$trailing_bytes = 5;
359	}
360	}
361	else
362	{
363	$trailing_bytes = $utf_len - 1;
364	}
365
// Emit one replacement char for the whole sequence, then move past the leading byte (++$pos)
// and past at most $trailing_bytes trailing bytes that follow it
366	$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
367	$pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
368	$tmp_pos = $pos;
369
370	continue;
371	}
372
373	if (isset($extra_check[$c]))
374	{
375	switch ($c)
376	{
377	// Note: 0xED is quite common in Korean
378	case "\xED":
379	if ($utf_char >= "\xED\xA0\x80")
380	{
381	// Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
382	$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
383	$pos += $utf_len;
384	$tmp_pos = $pos;
385	continue 2;
386	}
387	break;
388
389	// Note: 0xEF is quite common in Japanese
390	case "\xEF":
391	if ($utf_char == "\xEF\xBF\xBE" \|\| $utf_char == "\xEF\xBF\xBF")
392	{
393	// U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
394	$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
395	$pos += $utf_len;
396	$tmp_pos = $pos;
397	continue 2;
398	}
399	break;
400
401	case "\xC0":
402	case "\xC1":
403	if ($utf_char <= "\xC1\xBF")
404	{
405	// Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
406	$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
407	$pos += $utf_len;
408	$tmp_pos = $pos;
409	continue 2;
410	}
411	break;
412
413	case "\xE0":
414	if ($utf_char <= "\xE0\x9F\xBF")
415	{
416	// Overlong sequence: Unicode char U+0000..U+07FF encoded in 3 bytes
417	$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
418	$pos += $utf_len;
419	$tmp_pos = $pos;
420	continue 2;
421	}
422	break;
423
424	case "\xF0":
425	if ($utf_char <= "\xF0\x8F\xBF\xBF")
426	{
427	// Overlong sequence: Unicode char U+0000..U+FFFF encoded in 4 bytes
428	$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
429	$pos += $utf_len;
430	$tmp_pos = $pos;
431	continue 2;
432	}
433	break;
434
435	default:
436	// Five- and six- byte sequences do not need being checked for here anymore
437	if ($utf_char > UTF8_MAX)
438	{
439	// Out of the Unicode range
440	if ($utf_char[0] < "\xF8")
441	{
442	$trailing_bytes = 3;
443	}
444	else if ($utf_char[0] < "\xFC")
445	{
446	$trailing_bytes = 4;
447	}
448	else if ($utf_char[0] > "\xFD")
449	{
450	$trailing_bytes = 0;
451	}
452	else
453	{
454	$trailing_bytes = 5;
455	}
456
457	$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
458	$pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
459	$tmp_pos = $pos;
460	continue 2;
461	}
462	break;
463	}
464	}
465
466	// The char is a valid starter, move the cursor and go on
467	$pos += $utf_len;
468	continue;
469	}
470	}
471	else
472	{
473	// A trailing byte came out of nowhere, we will advance the cursor and treat this byte and all following trailing bytes as if
474	// each of them was a Unicode replacement char
475	$spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
476	$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
477
478	$pos += $spn;
479	$tmp_pos = $pos;
480	continue;
481	}
482
483
484	// STEP 1: Decompose current char
485
486	// We have found a character that is either:
487	// - in the NFC_QC/NFKC_QC list
488	// - a non-starter char that is not canonically ordered
489	//
490	// We are going to capture the shortest UTF sequence that satisfies these two conditions:
491	//
492	// 1 - If the sequence does not start at the beginning of the string, it must begin with a starter,
493	// and that starter must not have the NF[K]C_QC property equal to "MAYBE"
494	//
495	// 2 - If the sequence does not end at the end of the string, it must end with a non-starter and be
496	// immediately followed by a starter that is not on the QC list
497	//
// $lpos remembers where the captured sequence starts in $str; STEP 2 may move it further back
498	$utf_seq = array();
499	$last_cc = 0;
500	$lpos = $pos;
501	$pos += $utf_len;
502
503	if (isset($decomp_map[$utf_char]))
504	{
505	$_pos = 0;
506	$_len = strlen($decomp_map[$utf_char]);
507
// Split the decomposition into its individual chars. $_utf_len is bound by reference to the length entry
// of the byte at $_pos; isset() is false for an ASCII byte, which has no entry holding a value
508	do
509	{
510	$_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];
511
512	if (isset($_utf_len))
513	{
514	$utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
515	$_pos += $_utf_len;
516	}
517	else
518	{
519	$utf_seq[] = $decomp_map[$utf_char][$_pos];
520	++$_pos;
521	}
522	}
523	while ($_pos < $_len);
524	}
525	else
526	{
527	// The char is not decomposable
528	$utf_seq = array($utf_char);
529	}
530
531
532	// STEP 2: Capture the starter
533
534	// Check out the combining class of the first character of the UTF sequence
535	$k = 0;
536	if (isset($utf_combining_class[$utf_seq[0]]) \|\| $qc[$utf_char] == UNICODE_QC_MAYBE)
537	{
538	// Not a starter, inspect previous characters
539	// The last 8 characters are kept in a buffer so that we don't have to capture them every time.
540	// This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,
541	// although it is slower than this method.
542	//
543	// In the following loop, $j starts at the previous buffered character ($i - 1, because current character is
544	// at offset $i) and process them in backward mode until we find a starter.
545	//
546	// $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more
547	// characters numbered 0 to n. $k starts at 0 and is pre-decremented for each char we prepend, so prepended chars get negative indexes
548	$starter_found = 0;
549	$j_min = max(1, $i - 7);
550
551	for ($j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j)
552	{
553	$utf_char = $buffer[$j & 7];
554	$lpos -= strlen($utf_char);
555
556	if (isset($decomp_map[$utf_char]))
557	{
558	// The char is a composite, decompose for storage
559	$decomp_seq = array();
560	$_pos = 0;
561	$_len = strlen($decomp_map[$utf_char]);
562
563	do
564	{
565	$c = $decomp_map[$utf_char][$_pos];
566	$_utf_len =& $utf_len_mask[$c & "\xF0"];
567
568	if (isset($_utf_len))
569	{
570	$decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
571	$_pos += $_utf_len;
572	}
573	else
574	{
575	$decomp_seq[] = $c;
576	++$_pos;
577	}
578	}
579	while ($_pos < $_len);
580
581	// Prepend the UTF sequence with our decomposed sequence
582	if (isset($decomp_seq[1]))
583	{
584	// The char expanded into several chars
585	$decomp_cnt = sizeof($decomp_seq);
586
587	foreach ($decomp_seq as $decomp_i => $decomp_char)
588	{
589	$utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;
590	}
591	$k -= $decomp_cnt;
592	}
593	else
594	{
595	// Decomposed to a single char, easier to prepend
596	$utf_seq[--$k] = $decomp_seq[0];
597	}
598	}
599	else
600	{
601	$utf_seq[--$k] = $utf_char;
602	}
603
604	if (!isset($utf_combining_class[$utf_seq[$k]]))
605	{
606	// We have found our starter
607	$starter_found = 1;
608	break;
609	}
610	}
611
612	if (!$starter_found && $lpos > $tmp_pos)
613	{
614	// The starter was not found in the buffer, let's rewind some more
615	do
616	{
617	// $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_len > 0 then it's a leading byte, otherwise it's a trailing byte.
618	$c = $str[--$lpos];
619	$c_mask = $c & "\xF0";
620
621	if (isset($utf_len_mask[$c_mask]))
622	{
623	// UTF byte
624	if ($utf_len = $utf_len_mask[$c_mask])
625	{
626	// UTF leading byte
627	$utf_char = substr($str, $lpos, $utf_len);
628
629	if (isset($decomp_map[$utf_char]))
630	{
631	// Decompose the character
632	$decomp_seq = array();
633	$_pos = 0;
634	$_len = strlen($decomp_map[$utf_char]);
635
636	do
637	{
638	$c = $decomp_map[$utf_char][$_pos];
639	$_utf_len =& $utf_len_mask[$c & "\xF0"];
640
641	if (isset($_utf_len))
642	{
643	$decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
644	$_pos += $_utf_len;
645	}
646	else
647	{
648	$decomp_seq[] = $c;
649	++$_pos;
650	}
651	}
652	while ($_pos < $_len);
653
654	// Prepend the UTF sequence with our decomposed sequence
655	if (isset($decomp_seq[1]))
656	{
657	// The char expanded into several chars
658	$decomp_cnt = sizeof($decomp_seq);
659	foreach ($decomp_seq as $decomp_i => $utf_char)
660	{
661	$utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;
662	}
663	$k -= $decomp_cnt;
664	}
665	else
666	{
667	// Decomposed to a single char, easier to prepend
668	$utf_seq[--$k] = $decomp_seq[0];
669	}
670	}
671	else
672	{
673	$utf_seq[--$k] = $utf_char;
674	}
675	}
676	}
677	else
678	{
679	// ASCII char
680	$utf_seq[--$k] = $c;
681	}
682	}
683	while ($lpos > $tmp_pos);
684	}
685	}
686
687
688	// STEP 3: Capture following combining modifiers
689
690	while ($pos < $len)
691	{
692	$c_mask = $str[$pos] & "\xF0";
693
694	if (isset($utf_len_mask[$c_mask]))
695	{
696	if ($utf_len = $utf_len_mask[$c_mask])
697	{
698	$utf_char = substr($str, $pos, $utf_len);
699	}
700	else
701	{
702	// A trailing byte came out of nowhere
703	// Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop
704	// as if it was a starter (replacement chars ARE starters) and let the next loop replace it
705	break;
706	}
707
708	if (isset($utf_combining_class[$utf_char]) \|\| isset($qc[$utf_char]))
709	{
710	// Combining character or char on the QC list, add it to the sequence and move the cursor
711	if (isset($decomp_map[$utf_char]))
712	{
713	// Decompose the character
714	$_pos = 0;
715	$_len = strlen($decomp_map[$utf_char]);
716
717	do
718	{
719	$c = $decomp_map[$utf_char][$_pos];
720	$_utf_len =& $utf_len_mask[$c & "\xF0"];
721
722	if (isset($_utf_len))
723	{
724	$utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
725	$_pos += $_utf_len;
726	}
727	else
728	{
729	$utf_seq[] = $c;
730	++$_pos;
731	}
732	}
733	while ($_pos < $_len);
734	}
735	else
736	{
737	$utf_seq[] = $utf_char;
738	}
739
740	$pos += $utf_len;
741	}
742	else
743	{
744	// Combining class 0 and no QC, break out of the loop
745	// Note: we do not know if that character is valid. If it's not, the next iteration will replace it
746	break;
747	}
748	}
749	else
750	{
751	// ASCII chars are starters
752	break;
753	}
754	}
755
756
757	// STEP 4: Sort and combine
758
759	// $k is the index of the first char of the sequence, $k_max the index one past its last char (the sorting itself is done with ksort() further down)
760	$k_max = $k + sizeof($utf_seq);
761
762	if (!$k && $k_max == 1)
763	{
764	// There is only one char in the UTF sequence, add it then jump to the next iteration of main loop
765	// Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases
766	// if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))
767	// {
768	$tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $utf_seq[0];
769	$tmp_pos = $pos;
770	// }
771
772	continue;
773	}
774
775	// ...there we combine
776	if (isset($utf_combining_class[$utf_seq[$k]]))
777	{
778	$starter = $nf_seq = '';
779	}
780	else
781	{
782	$starter = $utf_seq[$k++];
783	$nf_seq = '';
784	}
785	$utf_sort = array();
786
787	// We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine
788	// at the end of the string without altering it
789	$utf_seq[] = '';
790
791	do
792	{
793	$utf_char = $utf_seq[$k++];
794
795	if (isset($utf_combining_class[$utf_char]))
796	{
// Non-starter: file it under its combining class until the next starter shows up
797	$utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
798	}
799	else
800	{
801	if (empty($utf_sort))
802	{
803	// No combining characters... check for a composite of the two starters
804	if (isset($utf_canonical_comp[$starter . $utf_char]))
805	{
806	// Good ol' composite character
807	$starter = $utf_canonical_comp[$starter . $utf_char];
808	}
809	else if (isset($utf_jamo_type[$utf_char]))
810	{
811	// Current char is a composable jamo
812	if (isset($utf_jamo_type[$starter]) && $utf_jamo_type[$starter] == UNICODE_JAMO_L && $utf_jamo_type[$utf_char] == UNICODE_JAMO_V)
813	{
814	// We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo
815	if (isset($utf_jamo_type[$utf_seq[$k]]) && $utf_jamo_type[$utf_seq[$k]] == UNICODE_JAMO_T)
816	{
817	// L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)
818	$cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char] + $utf_jamo_index[$utf_seq[$k]];
819	++$k;
820	}
821	else
822	{
823	// L+V jamos, combine to a LV Hangul syllable
824	$cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char];
825	}
826
// Encode the code point $cp as a three-byte UTF-8 sequence
827	$starter = chr(0xE0 \| ($cp >> 12)) . chr(0x80 \| (($cp >> 6) & 0x3F)) . chr(0x80 \| ($cp & 0x3F));
828	}
829	else
830	{
831	// Non-composable jamo, just add it to the sequence
832	$nf_seq .= $starter;
833	$starter = $utf_char;
834	}
835	}
836	else
837	{
838	// No composite, just add the first starter to the sequence then continue with the other one
839	$nf_seq .= $starter;
840	$starter = $utf_char;
841	}
842	}
843	else
844	{
// Put the buffered non-starters in ascending combining class order
845	ksort($utf_sort);
846
847	// For each class of combining characters
848	foreach ($utf_sort as $cc => $utf_chars)
849	{
850	$j = 0;
851
852	do
853	{
854	// Look for a composite
855	if (isset($utf_canonical_comp[$starter . $utf_chars[$j]]))
856	{
857	// Found a composite, replace the starter
858	$starter = $utf_canonical_comp[$starter . $utf_chars[$j]];
859	unset($utf_sort[$cc][$j]);
860	}
861	else
862	{
863	// No composite, all following characters in that class are blocked
864	break;
865	}
866	}
867	while (isset($utf_sort[$cc][++$j]));
868	}
869
870	// Add the starter to the normalized sequence, followed by non-starters in canonical order
871	$nf_seq .= $starter;
872
873	foreach ($utf_sort as $utf_chars)
874	{
875	if (!empty($utf_chars))
876	{
877	$nf_seq .= implode('', $utf_chars);
878	}
879	}
880
881	// Reset the array and go on
882	$utf_sort = array();
883	$starter = $utf_char;
884	}
885	}
886	}
887	while ($k <= $k_max);
888
// Copy the untouched part of $str that precedes the sequence, then the normalized sequence
889	$tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $nf_seq;
890	$tmp_pos = $pos;
891	}
892	else
893	{
894	// Only an ASCII char can make the program get here
895	//
896	// First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().
897	//
898	// The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on
899	// multi-byte text (where the only ASCII chars are spaces and punctuation)
900	if (++$pos != $len)
901	{
902	if ($str[$pos] < "\x80")
903	{
// Only the last char of the ASCII run is buffered
904	$pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
905	$buffer[++$i & 7] = $str[$pos - 1];
906	}
907	else
908	{
909	$buffer[++$i & 7] = $c;
910	}
911	}
912	}
913	}
914	while ($pos < $len);
915
916	// Now is time to return the string
917	if ($tmp_pos)
918	{
919	// If the $tmp_pos cursor is not at the beginning of the string then at least one character was not in normal form. Replace $str with the fixed version
920	if ($tmp_pos == $len)
921	{
922	// The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
923	return $tmp;
924	}
925	else
926	{
927	// The rightmost chunk of $str has not been appended to $tmp yet
928	return $tmp . substr($str, $tmp_pos);
929	}
930	}
931
932	// The string was already in normal form
933	return $str;
934	}
935
936	/**
937	* Decompose a UTF string
938	*
939	* @param string $str UTF string
940	* @param integer $pos Position of the first UTF char (in bytes)
941	* @param integer $len Length of the string (in bytes)
942	* @param array &$decomp_map Decomposition mapping, passed by reference but never modified
943	* @return string The string, decomposed and sorted canonically
944	*
945	* @access private
946	*/
947	function decompose($str, $pos, $len, &$decomp_map)
948	{
949	global $utf_combining_class;
950
951	// Load some commonly-used tables
952	if (!isset($utf_combining_class))
953	{
954	global $phpbb_root_path, $phpEx;
955	include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
956	}
957
958	// UTF char length array
959	$utf_len_mask = array(
960	// Leading bytes masks
961	"\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
962	// Trailing bytes masks
963	"\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
964	);
965
966	// Some extra checks are triggered on the first byte of a UTF sequence
967	$extra_check = array(
968	"\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
969	"\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
970	"\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
971	);
972
973	// These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge:
974	// - 2-byte: 110? ???? 10?? ????
975	// - 3-byte: 1110 ???? 10?? ???? 10?? ????
976	// - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????
977	// Note that 5- and 6- byte sequences are automatically discarded
978	$utf_validation_mask = array(
979	2 => "\xE0\xC0",
980	3 => "\xF0\xC0\xC0",
981	4 => "\xF8\xC0\xC0\xC0"
982	);
983
984	$utf_validation_check = array(
985	2 => "\xC0\x80",
986	3 => "\xE0\x80\x80",
987	4 => "\xF0\x80\x80\x80"
988	);
989
990	$tmp = '';
991	$starter_pos = $pos;
992	$tmp_pos = $last_cc = $sort = $dump = 0;
993	$utf_sort = array();
994
995
996	// Main loop
997	do
998	{
999	// STEP 0: Capture the current char
1000
1001	$cur_mask = $str[$pos] & "\xF0";
1002	if (isset($utf_len_mask[$cur_mask]))
1003	{
1004	if ($utf_len = $utf_len_mask[$cur_mask])
1005	{
1006	// Multibyte char
1007	$utf_char = substr($str, $pos, $utf_len);
1008	$pos += $utf_len;
1009	}
1010	else
1011	{
1012	// A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode
1013	// replacement char and we will advance the cursor
1014	$spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
1015
1016	if ($dump)
1017	{
1018	$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1019
1020	// Dump combiners
1021	if (!empty($utf_sort))
1022	{
1023	if ($sort)
1024	{
1025	ksort($utf_sort);
1026	}
1027
1028	foreach ($utf_sort as $utf_chars)
1029	{
1030	$tmp .= implode('', $utf_chars);
1031	}
1032	}
1033
1034	$tmp .= str_repeat(UTF8_REPLACEMENT, $spn);
1035	$dump = $sort = 0;
1036	}
1037	else
1038	{
1039	$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
1040	}
1041
1042	$pos += $spn;
1043	$tmp_pos = $starter_pos = $pos;
1044
1045	$utf_sort = array();
1046	$last_cc = 0;
1047
1048	continue;
1049	}
1050
1051
1052	// STEP 1: Decide what to do with current char
1053
1054	// Now, in that order:
1055	// - check if that character is decomposable
1056	// - check if that character is a non-starter
1057	// - check if that character requires extra checks to be performed
1058	if (isset($decomp_map[$utf_char]))
1059	{
1060	// Decompose the char
1061	$_pos = 0;
1062	$_len = strlen($decomp_map[$utf_char]);
1063
1064	do
1065	{
1066	$c = $decomp_map[$utf_char][$_pos];
1067	$_utf_len =& $utf_len_mask[$c & "\xF0"];
1068
1069	if (isset($_utf_len))
1070	{
1071	$_utf_char = substr($decomp_map[$utf_char], $_pos, $_utf_len);
1072	$_pos += $_utf_len;
1073
1074	if (isset($utf_combining_class[$_utf_char]))
1075	{
1076	// The character decomposed to a non-starter, buffer it for sorting
1077	$utf_sort[$utf_combining_class[$_utf_char]][] = $_utf_char;
1078
1079	if ($utf_combining_class[$_utf_char] < $last_cc)
1080	{
1081	// Not canonically ordered, will require sorting
1082	$sort = $dump = 1;
1083	}
1084	else
1085	{
1086	$dump = 1;
1087	$last_cc = $utf_combining_class[$_utf_char];
1088	}
1089	}
1090	else
1091	{
1092	// This character decomposition contains a starter, dump the buffer and continue
1093	if ($dump)
1094	{
1095	$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1096
1097	// Dump combiners
1098	if (!empty($utf_sort))
1099	{
1100	if ($sort)
1101	{
1102	ksort($utf_sort);
1103	}
1104
1105	foreach ($utf_sort as $utf_chars)
1106	{
1107	$tmp .= implode('', $utf_chars);
1108	}
1109	}
1110
1111	$tmp .= $_utf_char;
1112	$dump = $sort = 0;
1113	}
1114	else
1115	{
1116	$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos) . $_utf_char;
1117	}
1118
1119	$tmp_pos = $starter_pos = $pos;
1120	$utf_sort = array();
1121	$last_cc = 0;
1122	}
1123	}
1124	else
1125	{
1126	// This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue
1127	++$_pos;
1128
1129	if ($dump)
1130	{
1131	$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1132
1133	// Dump combiners
1134	if (!empty($utf_sort))
1135	{
1136	if ($sort)
1137	{
1138	ksort($utf_sort);
1139	}
1140
1141	foreach ($utf_sort as $utf_chars)
1142	{
1143	$tmp .= implode('', $utf_chars);
1144	}
1145	}
1146
1147	$tmp .= $c;
1148	$dump = $sort = 0;
1149	}
1150	else
1151	{
1152	$tmp .= substr($str, $tmp_pos, $pos - $utf_len - $tmp_pos) . $c;
1153	}
1154
1155	$tmp_pos = $starter_pos = $pos;
1156	$utf_sort = array();
1157	$last_cc = 0;
1158	}
1159	}
1160	while ($_pos < $_len);
1161	}
1162	else if (isset($utf_combining_class[$utf_char]))
1163	{
1164	// Combining character
1165	if ($utf_combining_class[$utf_char] < $last_cc)
1166	{
1167	// Not in canonical order
1168	$sort = $dump = 1;
1169	}
1170	else
1171	{
1172	$last_cc = $utf_combining_class[$utf_char];
1173	}
1174
1175	$utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
1176	}
1177	else
1178	{
1179	// Non-decomposable starter, check whether it's a Hangul syllable
1180	if ($utf_char < UTF8_HANGUL_FIRST \|\| $utf_char > UTF8_HANGUL_LAST)
1181	{
1182	// Nope, regular UTF char, check that we have the correct number of trailing bytes
1183	if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
1184	{
1185	// Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
1186	// has been encoded in a five- or six-byte sequence.
1187	// Move the cursor back to its original position then advance it to the position it should really be at
1188	$pos -= $utf_len;
1189	$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1190
1191	if (!empty($utf_sort))
1192	{
1193	ksort($utf_sort);
1194
1195	foreach ($utf_sort as $utf_chars)
1196	{
1197	$tmp .= implode('', $utf_chars);
1198	}
1199	$utf_sort = array();
1200	}
1201
1202	// Add a replacement char then another replacement char for every trailing byte.
1203	//
1204	// @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this
1205	$spn = strspn($str, UTF8_TRAILING_BYTES, ++$pos);
1206	$tmp .= str_repeat(UTF8_REPLACEMENT, $spn + 1);
1207
1208	$dump = $sort = 0;
1209
1210	$pos += $spn;
1211	$tmp_pos = $pos;
1212	continue;
1213	}
1214
1215	if (isset($extra_check[$utf_char[0]]))
1216	{
1217	switch ($utf_char[0])
1218	{
1219	// Note: 0xED is quite common in Korean
1220	case "\xED":
1221	if ($utf_char >= "\xED\xA0\x80")
1222	{
1223	// Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
1224	$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1225
1226	if (!empty($utf_sort))
1227	{
1228	ksort($utf_sort);
1229
1230	foreach ($utf_sort as $utf_chars)
1231	{
1232	$tmp .= implode('', $utf_chars);
1233	}
1234	$utf_sort = array();
1235	}
1236
1237	$tmp .= UTF8_REPLACEMENT;
1238	$dump = $sort = 0;
1239
1240	$tmp_pos = $starter_pos = $pos;
1241	continue 2;
1242	}
1243	break;
1244
1245	// Note: 0xEF is quite common in Japanese
1246	case "\xEF":
1247	if ($utf_char == "\xEF\xBF\xBE" \|\| $utf_char == "\xEF\xBF\xBF")
1248	{
1249	// U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
1250	$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1251
1252	if (!empty($utf_sort))
1253	{
1254	ksort($utf_sort);
1255
1256	foreach ($utf_sort as $utf_chars)
1257	{
1258	$tmp .= implode('', $utf_chars);
1259	}
1260	$utf_sort = array();
1261	}
1262
1263	$tmp .= UTF8_REPLACEMENT;
1264	$dump = $sort = 0;
1265
1266	$tmp_pos = $starter_pos = $pos;
1267	continue 2;
1268	}
1269	break;
1270
1271	case "\xC0":
1272	case "\xC1":
1273	if ($utf_char <= "\xC1\xBF")
1274	{
1275	// Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
1276	$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1277
1278	if (!empty($utf_sort))
1279	{
1280	ksort($utf_sort);
1281
1282	foreach ($utf_sort as $utf_chars)
1283	{
1284	$tmp .= implode('', $utf_chars);
1285	}
1286	$utf_sort = array();
1287	}
1288
1289	$tmp .= UTF8_REPLACEMENT;
1290	$dump = $sort = 0;
1291
1292	$tmp_pos = $starter_pos = $pos;
1293	continue 2;
1294	}
1295	break;
1296
1297	case "\xE0":
1298	if ($utf_char <= "\xE0\x9F\xBF")
1299	{
1300	// Unicode char U+0000..U+07FF encoded in 3 bytes
1301	$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1302
1303	if (!empty($utf_sort))
1304	{
1305	ksort($utf_sort);
1306
1307	foreach ($utf_sort as $utf_chars)
1308	{
1309	$tmp .= implode('', $utf_chars);
1310	}
1311	$utf_sort = array();
1312	}
1313
1314	$tmp .= UTF8_REPLACEMENT;
1315	$dump = $sort = 0;
1316
1317	$tmp_pos = $starter_pos = $pos;
1318	continue 2;
1319	}
1320	break;
1321
1322	case "\xF0":
1323	if ($utf_char <= "\xF0\x8F\xBF\xBF")
1324	{
1325	// Unicode char U+0000..U+FFFF encoded in 4 bytes
1326	$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1327
1328	if (!empty($utf_sort))
1329	{
1330	ksort($utf_sort);
1331
1332	foreach ($utf_sort as $utf_chars)
1333	{
1334	$tmp .= implode('', $utf_chars);
1335	}
1336	$utf_sort = array();
1337	}
1338
1339	$tmp .= UTF8_REPLACEMENT;
1340	$dump = $sort = 0;
1341
1342	$tmp_pos = $starter_pos = $pos;
1343	continue 2;
1344	}
1345	break;
1346
1347	default:
1348	if ($utf_char > UTF8_MAX)
1349	{
1350	// Out of the Unicode range
1351	$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1352
1353	if (!empty($utf_sort))
1354	{
1355	ksort($utf_sort);
1356
1357	foreach ($utf_sort as $utf_chars)
1358	{
1359	$tmp .= implode('', $utf_chars);
1360	}
1361	$utf_sort = array();
1362	}
1363
1364	$tmp .= UTF8_REPLACEMENT;
1365	$dump = $sort = 0;
1366
1367	$tmp_pos = $starter_pos = $pos;
1368	continue 2;
1369	}
1370	break;
1371	}
1372	}
1373	}
1374	else
1375	{
1376	// Hangul syllable
1377	$idx = (((ord($utf_char[0]) & 0x0F) << 12) \| ((ord($utf_char[1]) & 0x3F) << 6) \| (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE;
1378
1379	// LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).
1380	//
1381	// The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte
1382	if ($t_index = $idx % UNICODE_HANGUL_TCOUNT)
1383	{
1384	if ($t_index < 25)
1385	{
1386	$utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";
1387	$utf_char[8] = chr(0xA7 + $t_index);
1388	}
1389	else
1390	{
1391	$utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";
1392	$utf_char[8] = chr(0x67 + $t_index);
1393	}
1394	}
1395	else
1396	{
1397	$utf_char = "\xE1\x84\x00\xE1\x85\x00";
1398	}
1399
1400	$utf_char[2] = chr(0x80 + (int) ($idx / UNICODE_HANGUL_NCOUNT));
1401	$utf_char[5] = chr(0xA1 + (int) (($idx % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT));
1402
1403	// Just like other decompositions, the resulting Jamos must be dumped to the tmp string
1404	$dump = 1;
1405	}
1406
1407	// Do we need to dump stuff to the tmp string?
1408	if ($dump)
1409	{
1410	$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1411
1412	// Dump combiners
1413	if (!empty($utf_sort))
1414	{
1415	if ($sort)
1416	{
1417	ksort($utf_sort);
1418	}
1419
1420	foreach ($utf_sort as $utf_chars)
1421	{
1422	$tmp .= implode('', $utf_chars);
1423	}
1424	}
1425
1426	$tmp .= $utf_char;
1427	$dump = $sort = 0;
1428	$tmp_pos = $pos;
1429	}
1430
1431	$last_cc = 0;
1432	$utf_sort = array();
1433	$starter_pos = $pos;
1434	}
1435	}
1436	else
1437	{
1438	// ASCII char, which happens to be a starter (as any other ASCII char)
1439	if ($dump)
1440	{
1441	$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1442
1443	// Dump combiners
1444	if (!empty($utf_sort))
1445	{
1446	if ($sort)
1447	{
1448	ksort($utf_sort);
1449	}
1450
1451	foreach ($utf_sort as $utf_chars)
1452	{
1453	$tmp .= implode('', $utf_chars);
1454	}
1455	}
1456
1457	$tmp .= $str[$pos];
1458	$dump = $sort = 0;
1459	$tmp_pos = ++$pos;
1460
1461	$pos += strspn($str, UTF8_ASCII_RANGE, $pos);
1462	}
1463	else
1464	{
1465	$pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
1466	}
1467
1468	$last_cc = 0;
1469	$utf_sort = array();
1470	$starter_pos = $pos;
1471	}
1472	}
1473	while ($pos < $len);
1474
1475	// Now it is time to return the string
1476	if ($dump)
1477	{
1478	$tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1479
1480	// Dump combiners
1481	if (!empty($utf_sort))
1482	{
1483	if ($sort)
1484	{
1485	ksort($utf_sort);
1486	}
1487
1488	foreach ($utf_sort as $utf_chars)
1489	{
1490	$tmp .= implode('', $utf_chars);
1491	}
1492	}
1493
1494	return $tmp;
1495	}
1496	else if ($tmp_pos)
1497	{
1498	// If the $tmp_pos cursor was moved then at least one character was not in normal form. Replace $str with the fixed version
1499	if ($tmp_pos == $len)
1500	{
1501	// The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
1502	return $tmp;
1503	}
1504	else
1505	{
1506	// The rightmost chunk of $str has not been appended to $tmp yet
1507	return $tmp . substr($str, $tmp_pos);
1508	}
1509	}
1510
1511	// The string was already in normal form
1512	return $str;
1513	}
1514	}
1515
1516	?>

Note: See TracBrowser for help on using the repository browser.

Download in other formats: