| 1 | <?php
 | 
|---|
| 2 | /**
 | 
|---|
| 3 | *
 | 
|---|
| 4 | * @package utf
 | 
|---|
| 5 | * @version $Id: utf_normalizer.php 8479 2008-03-29 00:22:48Z naderman $
 | 
|---|
| 6 | * @copyright (c) 2005 phpBB Group
 | 
|---|
| 7 | * @license http://opensource.org/licenses/gpl-license.php GNU Public License
 | 
|---|
| 8 | *
 | 
|---|
| 9 | */
 | 
|---|
| 10 | 
 | 
|---|
| 11 | /**
 | 
|---|
| 12 | */
 | 
|---|
// Refuse to run unless the IN_PHPBB constant has been defined before this file is loaded
// (presumably by the phpBB entry scripts - confirm against the callers).
if (!defined('IN_PHPBB'))
{
	exit;
}

/**
* Some Unicode characters encoded in UTF-8
*
* Each constant holds the raw UTF-8 byte sequence of one code point.
* Preserved for compatibility
*/
define('UTF8_REPLACEMENT', "\xEF\xBF\xBD");		// U+FFFD, the replacement character
define('UTF8_MAX', "\xF4\x8F\xBF\xBF");			// U+10FFFF, the highest code point
define('UTF8_FFFE', "\xEF\xBF\xBE");			// U+FFFE
define('UTF8_FFFF', "\xEF\xBF\xBF");			// U+FFFF
define('UTF8_SURROGATE_FIRST', "\xED\xA0\x80");	// U+D800
define('UTF8_SURROGATE_LAST', "\xED\xBF\xBF");	// U+DFFF
define('UTF8_HANGUL_FIRST', "\xEA\xB0\x80");	// U+AC00
define('UTF8_HANGUL_LAST', "\xED\x9E\xA3");		// U+D7A3

define('UTF8_CJK_FIRST', "\xE4\xB8\x80");			// U+4E00
define('UTF8_CJK_LAST', "\xE9\xBE\xBB");			// U+9FBB
define('UTF8_CJK_B_FIRST', "\xF0\xA0\x80\x80");	// U+20000
define('UTF8_CJK_B_LAST', "\xF0\xAA\x9B\x96");		// U+2A6D6

// Unset global variables
// The normalizer methods load a data table only when isset($GLOBALS[...]) is false for it, so
// clearing these entries here makes sure a table is taken from the data files and not from a
// value that already existed in the global scope.
unset($GLOBALS['utf_jamo_index'], $GLOBALS['utf_jamo_type'], $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_combining_class'], $GLOBALS['utf_canonical_comp'], $GLOBALS['utf_canonical_decomp'], $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);

// NFC_QC and NFKC_QC values
// NOTE(review): presumably the values stored in the utf_nfc_qc / utf_nfkc_qc tables - confirm against the data files
define('UNICODE_QC_MAYBE', 0);
define('UNICODE_QC_NO', 1);

// Contains all the ASCII characters appearing in UTF-8, sorted by frequency
// Used as the strspn() mask that measures the leading ASCII-only part of a string
define('UTF8_ASCII_RANGE', "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F");

// Contains all the tail bytes that can appear in the composition of a UTF-8 char
// Used as the strspn() mask that skips the trailing bytes of an ill-formed sequence
define('UTF8_TRAILING_BYTES', "\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A");

// Constants used by the Hangul [de]composition algorithms
// The *BASE values are code points, the *COUNT values are sizes:
// NCOUNT = VCOUNT * TCOUNT (21 * 28 = 588) and SCOUNT = LCOUNT * NCOUNT (19 * 588 = 11172)
define('UNICODE_HANGUL_SBASE', 0xAC00);
define('UNICODE_HANGUL_LBASE', 0x1100);
define('UNICODE_HANGUL_VBASE', 0x1161);
define('UNICODE_HANGUL_TBASE', 0x11A7);
define('UNICODE_HANGUL_SCOUNT', 11172);
define('UNICODE_HANGUL_LCOUNT', 19);
define('UNICODE_HANGUL_VCOUNT', 21);
define('UNICODE_HANGUL_TCOUNT', 28);
define('UNICODE_HANGUL_NCOUNT', 588);
// NOTE(review): presumably the jamo kinds (leading / vowel / trailing) stored in $utf_jamo_type - confirm
define('UNICODE_JAMO_L', 0);
define('UNICODE_JAMO_V', 1);
define('UNICODE_JAMO_T', 2);
 | 
|---|
| 63 | 
 | 
|---|
| 64 | /**
 | 
|---|
| 65 | * Unicode normalization routines
 | 
|---|
| 66 | *
 | 
|---|
| 67 | * @package utf
 | 
|---|
| 68 | */
 | 
|---|
| 69 | class utf_normalizer
 | 
|---|
| 70 | {
 | 
|---|
/**
* Validate, clean up and normalize a string, in place
*
* Strings made only of harmless ASCII bytes are left untouched. Anything else
* has its forbidden control bytes turned into an illegal UTF-8 byte and is then
* handed to utf_normalizer::recompose() with the NFC tables.
*
* @param	string	&$str	The dirty string; overwritten with the processed string
* @return	void			Nothing is returned, the result is written back into $str
*/
function cleanup(&$str)
{
	// Every ASCII byte that is allowed to pass through as-is, sorted by frequency in latin text.
	// Control bytes other than \t, \n and \r are deliberately absent.
	$harmless_ascii = "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D";

	$harmless_len = strspn($str, $harmless_ascii);
	$total_len = strlen($str);

	if ($harmless_len === $total_len)
	{
		// Nothing but harmless ASCII: there is nothing to clean or normalize
		return;
	}

	// Lazily load the two tables needed for NFC; each one is checked and included on its own
	if (!isset($GLOBALS['utf_nfc_qc']))
	{
		global $phpbb_root_path, $phpEx;
		include $phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx;
	}

	if (!isset($GLOBALS['utf_canonical_decomp']))
	{
		global $phpbb_root_path, $phpEx;
		include $phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx;
	}

	// Bytes 0x00..0x1F except \t, \n and \r are mapped one-to-one onto 0xFF. That byte is illegal
	// in UTF-8, so it will in turn be replaced with a UTF replacement char.
	// The mapping keeps the byte length, so the offsets measured above stay valid.
	$forbidden_controls = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F";
	$illegal_markers = "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF";
	$str = strtr($str, $forbidden_controls, $illegal_markers);

	$str = utf_normalizer::recompose($str, $harmless_len, $total_len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
}
 | 
|---|
| 115 | 
 | 
|---|
/**
* Validate and normalize a UTF string to NFC, in place
*
* Pure ASCII strings are left untouched; everything else is passed to
* utf_normalizer::recompose() together with the NFC quick-check table and the
* canonical decomposition table.
*
* @param	string	&$str	Unchecked UTF string; overwritten with the processed string
* @return	void			Nothing is returned, the result is written back into $str
*/
function nfc(&$str)
{
	// Number of leading bytes that are plain ASCII, and the total number of bytes
	$ascii_len = strspn($str, UTF8_ASCII_RANGE);
	$total_len = strlen($str);

	if ($ascii_len === $total_len)
	{
		// The whole string is ASCII, nothing to do
		return;
	}

	// Lazily load the tables the first time they are needed
	if (!isset($GLOBALS['utf_nfc_qc']))
	{
		global $phpbb_root_path, $phpEx;
		include $phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx;
	}

	if (!isset($GLOBALS['utf_canonical_decomp']))
	{
		global $phpbb_root_path, $phpEx;
		include $phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx;
	}

	$str = utf_normalizer::recompose($str, $ascii_len, $total_len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
}
 | 
|---|
| 147 | 
 | 
|---|
/**
* Validate and normalize a UTF string to NFKC, in place
*
* Pure ASCII strings are left untouched; everything else is passed to
* utf_normalizer::recompose() together with the NFKC quick-check table and the
* compatibility decomposition table.
*
* @param	string	&$str	Unchecked UTF string; overwritten with the processed string
* @return	void			Nothing is returned, the result is written back into $str
*/
function nfkc(&$str)
{
	// Number of leading bytes that are plain ASCII, and the total number of bytes
	$ascii_len = strspn($str, UTF8_ASCII_RANGE);
	$total_len = strlen($str);

	if ($ascii_len === $total_len)
	{
		// The whole string is ASCII, nothing to do
		return;
	}

	// Lazily load the tables the first time they are needed
	if (!isset($GLOBALS['utf_nfkc_qc']))
	{
		global $phpbb_root_path, $phpEx;
		include $phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx;
	}

	if (!isset($GLOBALS['utf_compatibility_decomp']))
	{
		global $phpbb_root_path, $phpEx;
		include $phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx;
	}

	$str = utf_normalizer::recompose($str, $ascii_len, $total_len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
}
 | 
|---|
| 179 | 
 | 
|---|
/**
* Validate and normalize a UTF string to NFD, in place
*
* Pure ASCII strings are left untouched; everything else is passed to
* utf_normalizer::decompose() together with the canonical decomposition table.
*
* @param	string	&$str	Unchecked UTF string; overwritten with the processed string
* @return	void			Nothing is returned, the result is written back into $str
*/
function nfd(&$str)
{
	// Number of leading bytes that are plain ASCII, and the total number of bytes
	$ascii_len = strspn($str, UTF8_ASCII_RANGE);
	$total_len = strlen($str);

	if ($ascii_len === $total_len)
	{
		// The whole string is ASCII, nothing to do
		return;
	}

	// Lazily load the decomposition table the first time it is needed
	if (!isset($GLOBALS['utf_canonical_decomp']))
	{
		global $phpbb_root_path, $phpEx;
		include $phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx;
	}

	$str = utf_normalizer::decompose($str, $ascii_len, $total_len, $GLOBALS['utf_canonical_decomp']);
}
 | 
|---|
| 205 | 
 | 
|---|
/**
* Validate and normalize a UTF string to NFKD, in place
*
* Pure ASCII strings are left untouched; everything else is passed to
* utf_normalizer::decompose() together with the compatibility decomposition table.
*
* @param	string	&$str	Unchecked UTF string; overwritten with the processed string
* @return	void			Nothing is returned, the result is written back into $str
*/
function nfkd(&$str)
{
	// Number of leading bytes that are plain ASCII, and the total number of bytes
	$ascii_len = strspn($str, UTF8_ASCII_RANGE);
	$total_len = strlen($str);

	if ($ascii_len === $total_len)
	{
		// The whole string is ASCII, nothing to do
		return;
	}

	// Lazily load the decomposition table the first time it is needed
	if (!isset($GLOBALS['utf_compatibility_decomp']))
	{
		global $phpbb_root_path, $phpEx;
		include $phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx;
	}

	$str = utf_normalizer::decompose($str, $ascii_len, $total_len, $GLOBALS['utf_compatibility_decomp']);
}
 | 
|---|
| 231 | 
 | 
|---|
| 232 | 
 | 
|---|
| 233 |         /**
 | 
|---|
| 234 |         * Recompose a UTF string
 | 
|---|
| 235 |         *
 | 
|---|
| 236 |         * @param        string  $str                    Unchecked UTF string
 | 
|---|
| 237 |         * @param        integer $pos                    Position of the first UTF char (in bytes)
 | 
|---|
| 238 |         * @param        integer $len                    Length of the string (in bytes)
 | 
|---|
| 239 |         * @param        array   &$qc                    Quick-check array, passed by reference but never modified
 | 
|---|
| 240 |         * @param        array   &$decomp_map    Decomposition mapping, passed by reference but never modified
 | 
|---|
| 241 |         * @return       string                                  The string, validated and recomposed
 | 
|---|
| 242 |         *
 | 
|---|
| 243 |         * @access       private
 | 
|---|
| 244 |         */
 | 
|---|
| 245 |         function recompose($str, $pos, $len, &$qc, &$decomp_map)
 | 
|---|
| 246 |         {
 | 
|---|
| 247 |                 global $utf_combining_class, $utf_canonical_comp, $utf_jamo_type, $utf_jamo_index;
 | 
|---|
| 248 | 
 | 
|---|
| 249 |                 // Load some commonly-used tables
 | 
|---|
| 250 |                 if (!isset($utf_jamo_index, $utf_jamo_type, $utf_combining_class))
 | 
|---|
| 251 |                 {
 | 
|---|
| 252 |                         global $phpbb_root_path, $phpEx;
 | 
|---|
| 253 |                         include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
 | 
|---|
| 254 |                 }
 | 
|---|
| 255 | 
 | 
|---|
| 256 |                 // Load the canonical composition table
 | 
|---|
| 257 |                 if (!isset($utf_canonical_comp))
 | 
|---|
| 258 |                 {
 | 
|---|
| 259 |                         global $phpbb_root_path, $phpEx;
 | 
|---|
| 260 |                         include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);
 | 
|---|
| 261 |                 }
 | 
|---|
| 262 | 
 | 
|---|
| 263 |                 // Buffer the last ASCII char before the UTF-8 stuff if applicable
 | 
|---|
| 264 |                 $tmp = '';
 | 
|---|
| 265 |                 $i = $tmp_pos = $last_cc = 0;
 | 
|---|
| 266 | 
 | 
|---|
| 267 |                 $buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array();
 | 
|---|
| 268 | 
 | 
|---|
| 269 |                 // UTF char length array
 | 
|---|
| 270 |                 // This array is used to determine the length of a UTF character.
 | 
|---|
| 271 |                 // Be $c the result of ($str[$pos] & "\xF0") --where $str is the string we're operating on and $pos
 | 
|---|
| 272 |                 // the position of the cursor--, if $utf_len_mask[$c] does not exist, the byte is an ASCII char.
 | 
|---|
| 273 |                 // Otherwise, if $utf_len_mask[$c] is greater than 0, we have the leading byte of a multibyte character
 | 
|---|
| 274 |                 // whose length is $utf_len_mask[$c] and if it is equal to 0, the byte is a trailing byte.
 | 
|---|
| 275 |                 $utf_len_mask = array(
 | 
|---|
| 276 |                         // Leading bytes masks
 | 
|---|
| 277 |                         "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
 | 
|---|
| 278 |                         // Trailing bytes masks
 | 
|---|
| 279 |                         "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
 | 
|---|
| 280 |                 );
 | 
|---|
| 281 | 
 | 
|---|
| 282 |                 $extra_check = array(
 | 
|---|
| 283 |                         "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
 | 
|---|
| 284 |                         "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
 | 
|---|
| 285 |                         "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
 | 
|---|
| 286 |                 );
 | 
|---|
| 287 | 
 | 
|---|
| 288 |                 $utf_validation_mask = array(
 | 
|---|
| 289 |                         2       => "\xE0\xC0",
 | 
|---|
| 290 |                         3       => "\xF0\xC0\xC0",
 | 
|---|
| 291 |                         4       => "\xF8\xC0\xC0\xC0"
 | 
|---|
| 292 |                 );
 | 
|---|
| 293 | 
 | 
|---|
| 294 |                 $utf_validation_check = array(
 | 
|---|
| 295 |                         2       => "\xC0\x80",
 | 
|---|
| 296 |                         3       => "\xE0\x80\x80",
 | 
|---|
| 297 |                         4       => "\xF0\x80\x80\x80"
 | 
|---|
| 298 |                 );
 | 
|---|
| 299 | 
 | 
|---|
| 300 |                 // Main loop
 | 
|---|
| 301 |                 do
 | 
|---|
| 302 |                 {
 | 
|---|
| 303 |                         // STEP 0: Capture the current char and buffer it
 | 
|---|
| 304 |                         $c = $str[$pos];
 | 
|---|
| 305 |                         $c_mask = $c & "\xF0";
 | 
|---|
| 306 | 
 | 
|---|
| 307 |                         if (isset($utf_len_mask[$c_mask]))
 | 
|---|
| 308 |                         {
 | 
|---|
| 309 |                                 // Byte at $pos is either a leading byte or a misplaced trailing byte
 | 
|---|
| 310 |                                 if ($utf_len = $utf_len_mask[$c_mask])
 | 
|---|
| 311 |                                 {
 | 
|---|
| 312 |                                         // Capture the char
 | 
|---|
| 313 |                                         $buffer[++$i & 7] = $utf_char = substr($str, $pos, $utf_len);
 | 
|---|
| 314 | 
 | 
|---|
| 315 |                                         // Let's find out if a thorough check is needed
 | 
|---|
| 316 |                                         if (isset($qc[$utf_char]))
 | 
|---|
| 317 |                                         {
 | 
|---|
| 318 |                                                 // If the UTF char is in the qc array then it may not be in normal form. We do nothing here, the actual processing is below this "if" block
 | 
|---|
| 319 |                                         }
 | 
|---|
| 320 |                                         else if (isset($utf_combining_class[$utf_char]))
 | 
|---|
| 321 |                                         {
 | 
|---|
| 322 |                                                 if ($utf_combining_class[$utf_char] < $last_cc)
 | 
|---|
| 323 |                                                 {
 | 
|---|
| 324 |                                                         // A combining character that is NOT canonically ordered
 | 
|---|
| 325 |                                                 }
 | 
|---|
| 326 |                                                 else
 | 
|---|
| 327 |                                                 {
 | 
|---|
| 328 |                                                         // A combining character that IS canonically ordered, skip to the next char
 | 
|---|
| 329 |                                                         $last_cc = $utf_combining_class[$utf_char];
 | 
|---|
| 330 | 
 | 
|---|
| 331 |                                                         $pos += $utf_len;
 | 
|---|
| 332 |                                                         continue;
 | 
|---|
| 333 |                                                 }
 | 
|---|
| 334 |                                         }
 | 
|---|
| 335 |                                         else
 | 
|---|
| 336 |                                         {
 | 
|---|
| 337 |                                                 // At this point, $utf_char holds a UTF char that we know is not a NF[K]C_QC and is not a combining character.
 | 
|---|
| 338 |                                                 // It can be a singleton, a canonical composite, a replacement char or even an ill-formed bunch of bytes. Let's find out
 | 
|---|
| 339 |                                                 $last_cc = 0;
 | 
|---|
| 340 | 
 | 
|---|
| 341 |                                                 // Check that we have the correct number of trailing bytes
 | 
|---|
| 342 |                                                 if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
 | 
|---|
| 343 |                                                 {
 | 
|---|
| 344 |                                                         // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
 | 
|---|
| 345 |                                                         // has been encoded in a five- or six- byte sequence
 | 
|---|
| 346 |                                                         if ($utf_char[0] >= "\xF8")
 | 
|---|
| 347 |                                                         {
 | 
|---|
| 348 |                                                                 if ($utf_char[0] < "\xFC")
 | 
|---|
| 349 |                                                                 {
 | 
|---|
| 350 |                                                                         $trailing_bytes = 4;
 | 
|---|
| 351 |                                                                 }
 | 
|---|
| 352 |                                                                 else if ($utf_char[0] > "\xFD")
 | 
|---|
| 353 |                                                                 {
 | 
|---|
| 354 |                                                                         $trailing_bytes = 0;
 | 
|---|
| 355 |                                                                 }
 | 
|---|
| 356 |                                                                 else
 | 
|---|
| 357 |                                                                 {
 | 
|---|
| 358 |                                                                         $trailing_bytes = 5;
 | 
|---|
| 359 |                                                                 }
 | 
|---|
| 360 |                                                         }
 | 
|---|
| 361 |                                                         else
 | 
|---|
| 362 |                                                         {
 | 
|---|
| 363 |                                                                 $trailing_bytes = $utf_len - 1;
 | 
|---|
| 364 |                                                         }
 | 
|---|
| 365 | 
 | 
|---|
| 366 |                                                         $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 | 
|---|
| 367 |                                                         $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
 | 
|---|
| 368 |                                                         $tmp_pos = $pos;
 | 
|---|
| 369 | 
 | 
|---|
| 370 |                                                         continue;
 | 
|---|
| 371 |                                                 }
 | 
|---|
| 372 | 
 | 
|---|
| 373 |                                                 if (isset($extra_check[$c]))
 | 
|---|
| 374 |                                                 {
 | 
|---|
| 375 |                                                         switch ($c)
 | 
|---|
| 376 |                                                         {
 | 
|---|
| 377 |                                                                 // Note: 0xED is quite common in Korean
 | 
|---|
| 378 |                                                                 case "\xED":
 | 
|---|
| 379 |                                                                         if ($utf_char >= "\xED\xA0\x80")
 | 
|---|
| 380 |                                                                         {
 | 
|---|
| 381 |                                                                                 // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
 | 
|---|
| 382 |                                                                                 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 | 
|---|
| 383 |                                                                                 $pos += $utf_len;
 | 
|---|
| 384 |                                                                                 $tmp_pos = $pos;
 | 
|---|
| 385 |                                                                                 continue 2;
 | 
|---|
| 386 |                                                                         }
 | 
|---|
| 387 |                                                                 break;
 | 
|---|
| 388 | 
 | 
|---|
| 389 |                                                                 // Note: 0xEF is quite common in Japanese
 | 
|---|
| 390 |                                                                 case "\xEF":
 | 
|---|
| 391 |                                                                         if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
 | 
|---|
| 392 |                                                                         {
 | 
|---|
| 393 |                                                                                 // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
 | 
|---|
| 394 |                                                                                 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 | 
|---|
| 395 |                                                                                 $pos += $utf_len;
 | 
|---|
| 396 |                                                                                 $tmp_pos = $pos;
 | 
|---|
| 397 |                                                                                 continue 2;
 | 
|---|
| 398 |                                                                         }
 | 
|---|
| 399 |                                                                 break;
 | 
|---|
| 400 | 
 | 
|---|
| 401 |                                                                 case "\xC0":
 | 
|---|
| 402 |                                                                 case "\xC1":
 | 
|---|
| 403 |                                                                         if ($utf_char <= "\xC1\xBF")
 | 
|---|
| 404 |                                                                         {
 | 
|---|
| 405 |                                                                                 // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
 | 
|---|
| 406 |                                                                                 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 | 
|---|
| 407 |                                                                                 $pos += $utf_len;
 | 
|---|
| 408 |                                                                                 $tmp_pos = $pos;
 | 
|---|
| 409 |                                                                                 continue 2;
 | 
|---|
| 410 |                                                                         }
 | 
|---|
| 411 |                                                                 break;
 | 
|---|
| 412 | 
 | 
|---|
| 413 |                                                                 case "\xE0":
 | 
|---|
| 414 |                                                                         if ($utf_char <= "\xE0\x9F\xBF")
 | 
|---|
| 415 |                                                                         {
 | 
|---|
| 416 |                                                                                 // Unicode char U+0000..U+07FF encoded in 3 bytes
 | 
|---|
| 417 |                                                                                 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 | 
|---|
| 418 |                                                                                 $pos += $utf_len;
 | 
|---|
| 419 |                                                                                 $tmp_pos = $pos;
 | 
|---|
| 420 |                                                                                 continue 2;
 | 
|---|
| 421 |                                                                         }
 | 
|---|
| 422 |                                                                 break;
 | 
|---|
| 423 | 
 | 
|---|
| 424 |                                                                 case "\xF0":
 | 
|---|
| 425 |                                                                         if ($utf_char <= "\xF0\x8F\xBF\xBF")
 | 
|---|
| 426 |                                                                         {
 | 
|---|
| 427 |                                                                                 // Unicode char U+0000..U+FFFF encoded in 4 bytes
 | 
|---|
| 428 |                                                                                 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 | 
|---|
| 429 |                                                                                 $pos += $utf_len;
 | 
|---|
| 430 |                                                                                 $tmp_pos = $pos;
 | 
|---|
| 431 |                                                                                 continue 2;
 | 
|---|
| 432 |                                                                         }
 | 
|---|
| 433 |                                                                 break;
 | 
|---|
| 434 | 
 | 
|---|
| 435 |                                                                 default:
 | 
|---|
| 436 |                                                                         // Five- and six- byte sequences do not need being checked for here anymore
 | 
|---|
| 437 |                                                                         if ($utf_char > UTF8_MAX)
 | 
|---|
| 438 |                                                                         {
 | 
|---|
| 439 |                                                                                 // Out of the Unicode range
 | 
|---|
| 440 |                                                                                 if ($utf_char[0] < "\xF8")
 | 
|---|
| 441 |                                                                                 {
 | 
|---|
| 442 |                                                                                         $trailing_bytes = 3;
 | 
|---|
| 443 |                                                                                 }
 | 
|---|
| 444 |                                                                                 else if ($utf_char[0] < "\xFC")
 | 
|---|
| 445 |                                                                                 {
 | 
|---|
| 446 |                                                                                         $trailing_bytes = 4;
 | 
|---|
| 447 |                                                                                 }
 | 
|---|
| 448 |                                                                                 else if ($utf_char[0] > "\xFD")
 | 
|---|
| 449 |                                                                                 {
 | 
|---|
| 450 |                                                                                         $trailing_bytes = 0;
 | 
|---|
| 451 |                                                                                 }
 | 
|---|
| 452 |                                                                                 else
 | 
|---|
| 453 |                                                                                 {
 | 
|---|
| 454 |                                                                                         $trailing_bytes = 5;
 | 
|---|
| 455 |                                                                                 }
 | 
|---|
| 456 | 
 | 
|---|
| 457 |                                                                                 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 | 
|---|
| 458 |                                                                                 $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
 | 
|---|
| 459 |                                                                                 $tmp_pos = $pos;
 | 
|---|
| 460 |                                                                                 continue 2;
 | 
|---|
| 461 |                                                                         }
 | 
|---|
| 462 |                                                                 break;
 | 
|---|
| 463 |                                                         }
 | 
|---|
| 464 |                                                 }
 | 
|---|
| 465 | 
 | 
|---|
| 466 |                                                 // The char is a valid starter, move the cursor and go on
 | 
|---|
| 467 |                                                 $pos += $utf_len;
 | 
|---|
| 468 |                                                 continue;
 | 
|---|
| 469 |                                         }
 | 
|---|
| 470 |                                 }
 | 
|---|
| 471 |                                 else
 | 
|---|
| 472 |                                 {
 | 
|---|
| 473 |                                         // A trailing byte came out of nowhere, we will advance the cursor and treat this byte and all following trailing bytes as if
 | 
|---|
| 474 |                                         // each of them was a Unicode replacement char
 | 
|---|
| 475 |                                         $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
 | 
|---|
| 476 |                                         $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
 | 
|---|
| 477 | 
 | 
|---|
| 478 |                                         $pos += $spn;
 | 
|---|
| 479 |                                         $tmp_pos = $pos;
 | 
|---|
| 480 |                                         continue;
 | 
|---|
| 481 |                                 }
 | 
|---|
| 482 | 
 | 
|---|
| 483 | 
 | 
|---|
| 484 |                                 // STEP 1: Decompose current char
 | 
|---|
| 485 | 
 | 
|---|
| 486 |                                 // We have found a character that is either:
 | 
|---|
| 487 |                                 //  - in the NFC_QC/NFKC_QC list
 | 
|---|
| 488 |                                 //  - a non-starter char that is not canonically ordered
 | 
|---|
| 489 |                                 //
 | 
|---|
| 490 |                                 // We are going to capture the shortest UTF sequence that satisfies these two conditions:
 | 
|---|
| 491 |                                 //
 | 
|---|
| 492 |                                 //  1 - If the sequence does not start at the beginning of the string, it must begin with a starter,
 | 
|---|
| 493 |                                 // and that starter must not have the NF[K]C_QC property equal to "MAYBE"
 | 
|---|
| 494 |                                 //
 | 
|---|
| 495 |                                 //  2 - If the sequence does not end at the end of the string, it must end with a non-starter and be
 | 
|---|
| 496 |                                 // immediately followed by a starter that is not on the QC list
 | 
|---|
| 497 |                                 //
 | 
|---|
| 498 |                                 $utf_seq = array();
 | 
|---|
| 499 |                                 $last_cc = 0;
 | 
|---|
| 500 |                                 $lpos = $pos;
 | 
|---|
| 501 |                                 $pos += $utf_len;
 | 
|---|
| 502 | 
 | 
|---|
| 503 |                                 if (isset($decomp_map[$utf_char]))
 | 
|---|
| 504 |                                 {
 | 
|---|
| 505 |                                         $_pos = 0;
 | 
|---|
| 506 |                                         $_len = strlen($decomp_map[$utf_char]);
 | 
|---|
| 507 | 
 | 
|---|
| 508 |                                         do
 | 
|---|
| 509 |                                         {
 | 
|---|
| 510 |                                                 $_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];
 | 
|---|
| 511 | 
 | 
|---|
| 512 |                                                 if (isset($_utf_len))
 | 
|---|
| 513 |                                                 {
 | 
|---|
| 514 |                                                         $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
 | 
|---|
| 515 |                                                         $_pos += $_utf_len;
 | 
|---|
| 516 |                                                 }
 | 
|---|
| 517 |                                                 else
 | 
|---|
| 518 |                                                 {
 | 
|---|
| 519 |                                                         $utf_seq[] = $decomp_map[$utf_char][$_pos];
 | 
|---|
| 520 |                                                         ++$_pos;
 | 
|---|
| 521 |                                                 }
 | 
|---|
| 522 |                                         }
 | 
|---|
| 523 |                                         while ($_pos < $_len);
 | 
|---|
| 524 |                                 }
 | 
|---|
| 525 |                                 else
 | 
|---|
| 526 |                                 {
 | 
|---|
| 527 |                                         // The char is not decomposable
 | 
|---|
| 528 |                                         $utf_seq = array($utf_char);
 | 
|---|
| 529 |                                 }
 | 
|---|
| 530 | 
 | 
|---|
| 531 | 
 | 
|---|
| 532 |                                 // STEP 2: Capture the starter
 | 
|---|
| 533 | 
 | 
|---|
| 534 |                                 // Check out the combining class of the first character of the UTF sequence
 | 
|---|
| 535 |                                 $k = 0;
 | 
|---|
| 536 |                                 if (isset($utf_combining_class[$utf_seq[0]]) || $qc[$utf_char] == UNICODE_QC_MAYBE)
 | 
|---|
| 537 |                                 {
 | 
|---|
| 538 |                                         // Not a starter, inspect previous characters
 | 
|---|
| 539 |                                         // The last 8 characters are kept in a buffer so that we don't have to capture them every time.
 | 
|---|
| 540 |                                         // This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,
 | 
|---|
| 541 |                                         // although it is slower than this method.
 | 
|---|
| 542 |                                         //
 | 
|---|
| 543 |                                         // In the following loop, $j starts at the previous buffered character ($i - 1, because current character is
 | 
|---|
| 544 |                                         // at offset $i) and process them in backward mode until we find a starter.
 | 
|---|
| 545 |                                         //
 | 
|---|
| 546 |                                         // $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more
 | 
|---|
| 547 |                                         // characters numbered 0 to n. $k starts at 0 and for each char we prepend we pre-decrement it and for numbering
 | 
|---|
| 548 |                                         $starter_found = 0;
 | 
|---|
| 549 |                                         $j_min = max(1, $i - 7);
 | 
|---|
| 550 | 
 | 
|---|
| 551 |                                         for ($j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j)
 | 
|---|
| 552 |                                         {
 | 
|---|
| 553 |                                                 $utf_char = $buffer[$j & 7];
 | 
|---|
| 554 |                                                 $lpos -= strlen($utf_char);
 | 
|---|
| 555 | 
 | 
|---|
| 556 |                                                 if (isset($decomp_map[$utf_char]))
 | 
|---|
| 557 |                                                 {
 | 
|---|
| 558 |                                                         // The char is a composite, decompose for storage
 | 
|---|
| 559 |                                                         $decomp_seq = array();
 | 
|---|
| 560 |                                                         $_pos = 0;
 | 
|---|
| 561 |                                                         $_len = strlen($decomp_map[$utf_char]);
 | 
|---|
| 562 | 
 | 
|---|
| 563 |                                                         do
 | 
|---|
| 564 |                                                         {
 | 
|---|
| 565 |                                                                 $c = $decomp_map[$utf_char][$_pos];
 | 
|---|
| 566 |                                                                 $_utf_len =& $utf_len_mask[$c & "\xF0"];
 | 
|---|
| 567 | 
 | 
|---|
| 568 |                                                                 if (isset($_utf_len))
 | 
|---|
| 569 |                                                                 {
 | 
|---|
| 570 |                                                                         $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
 | 
|---|
| 571 |                                                                         $_pos += $_utf_len;
 | 
|---|
| 572 |                                                                 }
 | 
|---|
| 573 |                                                                 else
 | 
|---|
| 574 |                                                                 {
 | 
|---|
| 575 |                                                                         $decomp_seq[] = $c;
 | 
|---|
| 576 |                                                                         ++$_pos;
 | 
|---|
| 577 |                                                                 }
 | 
|---|
| 578 |                                                         }
 | 
|---|
| 579 |                                                         while ($_pos < $_len);
 | 
|---|
| 580 | 
 | 
|---|
| 581 |                                                         // Prepend the UTF sequence with our decomposed sequence
 | 
|---|
| 582 |                                                         if (isset($decomp_seq[1]))
 | 
|---|
| 583 |                                                         {
 | 
|---|
| 584 |                                                                 // The char expanded into several chars
 | 
|---|
| 585 |                                                                 $decomp_cnt = sizeof($decomp_seq);
 | 
|---|
| 586 | 
 | 
|---|
| 587 |                                                                 foreach ($decomp_seq as $decomp_i => $decomp_char)
 | 
|---|
| 588 |                                                                 {
 | 
|---|
| 589 |                                                                         $utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;
 | 
|---|
| 590 |                                                                 }
 | 
|---|
| 591 |                                                                 $k -= $decomp_cnt;
 | 
|---|
| 592 |                                                         }
 | 
|---|
| 593 |                                                         else
 | 
|---|
| 594 |                                                         {
 | 
|---|
| 595 |                                                                 // Decomposed to a single char, easier to prepend
 | 
|---|
| 596 |                                                                 $utf_seq[--$k] = $decomp_seq[0];
 | 
|---|
| 597 |                                                         }
 | 
|---|
| 598 |                                                 }
 | 
|---|
| 599 |                                                 else
 | 
|---|
| 600 |                                                 {
 | 
|---|
| 601 |                                                         $utf_seq[--$k] = $utf_char;
 | 
|---|
| 602 |                                                 }
 | 
|---|
| 603 | 
 | 
|---|
| 604 |                                                 if (!isset($utf_combining_class[$utf_seq[$k]]))
 | 
|---|
| 605 |                                                 {
 | 
|---|
| 606 |                                                         // We have found our starter
 | 
|---|
| 607 |                                                         $starter_found = 1;
 | 
|---|
| 608 |                                                         break;
 | 
|---|
| 609 |                                                 }
 | 
|---|
| 610 |                                         }
 | 
|---|
| 611 | 
 | 
|---|
| 612 |                                         if (!$starter_found && $lpos > $tmp_pos)
 | 
|---|
| 613 |                                         {
 | 
|---|
| 614 |                                                 // The starter was not found in the buffer, let's rewind some more
 | 
|---|
| 615 |                                                 do
 | 
|---|
| 616 |                                                 {
 | 
|---|
| 617 |                                                         // $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_len > 0 then it's a leading byte, otherwise it's a trailing byte.
 | 
|---|
| 618 |                                                         $c = $str[--$lpos];
 | 
|---|
| 619 |                                                         $c_mask = $c & "\xF0";
 | 
|---|
| 620 | 
 | 
|---|
| 621 |                                                         if (isset($utf_len_mask[$c_mask]))
 | 
|---|
| 622 |                                                         {
 | 
|---|
| 623 |                                                                 // UTF byte
 | 
|---|
| 624 |                                                                 if ($utf_len = $utf_len_mask[$c_mask])
 | 
|---|
| 625 |                                                                 {
 | 
|---|
| 626 |                                                                         // UTF *leading* byte
 | 
|---|
| 627 |                                                                         $utf_char = substr($str, $lpos, $utf_len);
 | 
|---|
| 628 | 
 | 
|---|
| 629 |                                                                         if (isset($decomp_map[$utf_char]))
 | 
|---|
| 630 |                                                                         {
 | 
|---|
| 631 |                                                                                 // Decompose the character
 | 
|---|
| 632 |                                                                                 $decomp_seq = array();
 | 
|---|
| 633 |                                                                                 $_pos = 0;
 | 
|---|
| 634 |                                                                                 $_len = strlen($decomp_map[$utf_char]);
 | 
|---|
| 635 | 
 | 
|---|
| 636 |                                                                                 do
 | 
|---|
| 637 |                                                                                 {
 | 
|---|
| 638 |                                                                                         $c = $decomp_map[$utf_char][$_pos];
 | 
|---|
| 639 |                                                                                         $_utf_len =& $utf_len_mask[$c & "\xF0"];
 | 
|---|
| 640 | 
 | 
|---|
| 641 |                                                                                         if (isset($_utf_len))
 | 
|---|
| 642 |                                                                                         {
 | 
|---|
| 643 |                                                                                                 $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
 | 
|---|
| 644 |                                                                                                 $_pos += $_utf_len;
 | 
|---|
| 645 |                                                                                         }
 | 
|---|
| 646 |                                                                                         else
 | 
|---|
| 647 |                                                                                         {
 | 
|---|
| 648 |                                                                                                 $decomp_seq[] = $c;
 | 
|---|
| 649 |                                                                                                 ++$_pos;
 | 
|---|
| 650 |                                                                                         }
 | 
|---|
| 651 |                                                                                 }
 | 
|---|
| 652 |                                                                                 while ($_pos < $_len);
 | 
|---|
| 653 | 
 | 
|---|
| 654 |                                                                                 // Prepend the UTF sequence with our decomposed sequence
 | 
|---|
| 655 |                                                                                 if (isset($decomp_seq[1]))
 | 
|---|
| 656 |                                                                                 {
 | 
|---|
| 657 |                                                                                         // The char expanded into several chars
 | 
|---|
| 658 |                                                                                         $decomp_cnt = sizeof($decomp_seq);
 | 
|---|
| 659 |                                                                                         foreach ($decomp_seq as $decomp_i => $utf_char)
 | 
|---|
| 660 |                                                                                         {
 | 
|---|
| 661 |                                                                                                 $utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;
 | 
|---|
| 662 |                                                                                         }
 | 
|---|
| 663 |                                                                                         $k -= $decomp_cnt;
 | 
|---|
| 664 |                                                                                 }
 | 
|---|
| 665 |                                                                                 else
 | 
|---|
| 666 |                                                                                 {
 | 
|---|
| 667 |                                                                                         // Decomposed to a single char, easier to prepend
 | 
|---|
| 668 |                                                                                         $utf_seq[--$k] = $decomp_seq[0];
 | 
|---|
| 669 |                                                                                 }
 | 
|---|
| 670 |                                                                         }
 | 
|---|
| 671 |                                                                         else
 | 
|---|
| 672 |                                                                         {
 | 
|---|
| 673 |                                                                                 $utf_seq[--$k] = $utf_char;
 | 
|---|
| 674 |                                                                         }
 | 
|---|
| 675 |                                                                 }
 | 
|---|
| 676 |                                                         }
 | 
|---|
| 677 |                                                         else
 | 
|---|
| 678 |                                                         {
 | 
|---|
| 679 |                                                                 // ASCII char
 | 
|---|
| 680 |                                                                 $utf_seq[--$k] = $c;
 | 
|---|
| 681 |                                                         }
 | 
|---|
| 682 |                                                 }
 | 
|---|
| 683 |                                                 while ($lpos > $tmp_pos);
 | 
|---|
| 684 |                                         }
 | 
|---|
| 685 |                                 }
 | 
|---|
| 686 | 
 | 
|---|
| 687 | 
 | 
|---|
| 688 |                                 // STEP 3: Capture following combining modifiers
 | 
|---|
| 689 | 
 | 
|---|
| 690 |                                 while ($pos < $len)
 | 
|---|
| 691 |                                 {
 | 
|---|
| 692 |                                         $c_mask = $str[$pos] & "\xF0";
 | 
|---|
| 693 | 
 | 
|---|
| 694 |                                         if (isset($utf_len_mask[$c_mask]))
 | 
|---|
| 695 |                                         {
 | 
|---|
| 696 |                                                 if ($utf_len = $utf_len_mask[$c_mask])
 | 
|---|
| 697 |                                                 {
 | 
|---|
| 698 |                                                         $utf_char = substr($str, $pos, $utf_len);
 | 
|---|
| 699 |                                                 }
 | 
|---|
| 700 |                                                 else
 | 
|---|
| 701 |                                                 {
 | 
|---|
| 702 |                                                         // A trailing byte came out of nowhere
 | 
|---|
| 703 |                                                         // Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop
 | 
|---|
| 704 |                                                         // as if it was a starter (replacement chars ARE starters) and let the next loop replace it
 | 
|---|
| 705 |                                                         break;
 | 
|---|
| 706 |                                                 }
 | 
|---|
| 707 | 
 | 
|---|
| 708 |                                                 if (isset($utf_combining_class[$utf_char]) || isset($qc[$utf_char]))
 | 
|---|
| 709 |                                                 {
 | 
|---|
| 710 |                                                         // Combining character, add it to the sequence and move the cursor
 | 
|---|
| 711 |                                                         if (isset($decomp_map[$utf_char]))
 | 
|---|
| 712 |                                                         {
 | 
|---|
| 713 |                                                                 // Decompose the character
 | 
|---|
| 714 |                                                                 $_pos = 0;
 | 
|---|
| 715 |                                                                 $_len = strlen($decomp_map[$utf_char]);
 | 
|---|
| 716 | 
 | 
|---|
| 717 |                                                                 do
 | 
|---|
| 718 |                                                                 {
 | 
|---|
| 719 |                                                                         $c = $decomp_map[$utf_char][$_pos];
 | 
|---|
| 720 |                                                                         $_utf_len =& $utf_len_mask[$c & "\xF0"];
 | 
|---|
| 721 | 
 | 
|---|
| 722 |                                                                         if (isset($_utf_len))
 | 
|---|
| 723 |                                                                         {
 | 
|---|
| 724 |                                                                                 $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
 | 
|---|
| 725 |                                                                                 $_pos += $_utf_len;
 | 
|---|
| 726 |                                                                         }
 | 
|---|
| 727 |                                                                         else
 | 
|---|
| 728 |                                                                         {
 | 
|---|
| 729 |                                                                                 $utf_seq[] = $c;
 | 
|---|
| 730 |                                                                                 ++$_pos;
 | 
|---|
| 731 |                                                                         }
 | 
|---|
| 732 |                                                                 }
 | 
|---|
| 733 |                                                                 while ($_pos < $_len);
 | 
|---|
| 734 |                                                         }
 | 
|---|
| 735 |                                                         else
 | 
|---|
| 736 |                                                         {
 | 
|---|
| 737 |                                                                 $utf_seq[] = $utf_char;
 | 
|---|
| 738 |                                                         }
 | 
|---|
| 739 | 
 | 
|---|
| 740 |                                                         $pos += $utf_len;
 | 
|---|
| 741 |                                                 }
 | 
|---|
| 742 |                                                 else
 | 
|---|
| 743 |                                                 {
 | 
|---|
| 744 |                                                         // Combining class 0 and no QC, break out of the loop
 | 
|---|
| 745 |                                                         // Note: we do not know if that character is valid. If it's not, the next iteration will replace it
 | 
|---|
| 746 |                                                         break;
 | 
|---|
| 747 |                                                 }
 | 
|---|
| 748 |                                         }
 | 
|---|
| 749 |                                         else
 | 
|---|
| 750 |                                         {
 | 
|---|
| 751 |                                                 // ASCII chars are starters
 | 
|---|
| 752 |                                                 break;
 | 
|---|
| 753 |                                         }
 | 
|---|
| 754 |                                 }
 | 
|---|
| 755 | 
 | 
|---|
| 756 | 
 | 
|---|
| 757 |                                 // STEP 4: Sort and combine
 | 
|---|
| 758 | 
 | 
|---|
| 759 |                                 // Here we sort...
 | 
|---|
| 760 |                                 $k_max = $k + sizeof($utf_seq);
 | 
|---|
| 761 | 
 | 
|---|
| 762 |                                 if (!$k && $k_max == 1)
 | 
|---|
| 763 |                                 {
 | 
|---|
| 764 |                                         // There is only one char in the UTF sequence, add it then jump to the next iteration of main loop
 | 
|---|
| 765 |                                                 // Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases
 | 
|---|
| 766 | //                                              if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))
 | 
|---|
| 767 | //                                              {
 | 
|---|
| 768 |                                                 $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $utf_seq[0];
 | 
|---|
| 769 |                                                 $tmp_pos = $pos;
 | 
|---|
| 770 | //                                              }
 | 
|---|
| 771 | 
 | 
|---|
| 772 |                                         continue;
 | 
|---|
| 773 |                                 }
 | 
|---|
| 774 | 
 | 
|---|
| 775 |                                 // ...there we combine
 | 
|---|
| 776 |                                 if (isset($utf_combining_class[$utf_seq[$k]]))
 | 
|---|
| 777 |                                 {
 | 
|---|
| 778 |                                         $starter = $nf_seq = '';
 | 
|---|
| 779 |                                 }
 | 
|---|
| 780 |                                 else
 | 
|---|
| 781 |                                 {
 | 
|---|
| 782 |                                         $starter = $utf_seq[$k++];
 | 
|---|
| 783 |                                         $nf_seq = '';
 | 
|---|
| 784 |                                 }
 | 
|---|
| 785 |                                 $utf_sort = array();
 | 
|---|
| 786 | 
 | 
|---|
| 787 |                                 // We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine
 | 
|---|
| 788 |                                 // at the end of the string without altering it
 | 
|---|
| 789 |                                 $utf_seq[] = '';
 | 
|---|
| 790 | 
 | 
|---|
| 791 |                                 do
 | 
|---|
| 792 |                                 {
 | 
|---|
| 793 |                                         $utf_char = $utf_seq[$k++];
 | 
|---|
| 794 | 
 | 
|---|
| 795 |                                         if (isset($utf_combining_class[$utf_char]))
 | 
|---|
| 796 |                                         {
 | 
|---|
| 797 |                                                 $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
 | 
|---|
| 798 |                                         }
 | 
|---|
| 799 |                                         else
 | 
|---|
| 800 |                                         {
 | 
|---|
| 801 |                                                 if (empty($utf_sort))
 | 
|---|
| 802 |                                                 {
 | 
|---|
| 803 |                                                         // No combining characters... check for a composite of the two starters
 | 
|---|
| 804 |                                                         if (isset($utf_canonical_comp[$starter . $utf_char]))
 | 
|---|
| 805 |                                                         {
 | 
|---|
| 806 |                                                                 // Good ol' composite character
 | 
|---|
| 807 |                                                                 $starter = $utf_canonical_comp[$starter . $utf_char];
 | 
|---|
| 808 |                                                         }
 | 
|---|
| 809 |                                                         else if (isset($utf_jamo_type[$utf_char]))
 | 
|---|
| 810 |                                                         {
 | 
|---|
| 811 |                                                                 // Current char is a composable jamo
 | 
|---|
| 812 |                                                                 if (isset($utf_jamo_type[$starter]) && $utf_jamo_type[$starter] == UNICODE_JAMO_L && $utf_jamo_type[$utf_char] == UNICODE_JAMO_V)
 | 
|---|
| 813 |                                                                 {
 | 
|---|
| 814 |                                                                         // We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo
 | 
|---|
| 815 |                                                                         if (isset($utf_jamo_type[$utf_seq[$k]]) && $utf_jamo_type[$utf_seq[$k]] == UNICODE_JAMO_T)
 | 
|---|
| 816 |                                                                         {
 | 
|---|
| 817 |                                                                                 // L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)
 | 
|---|
| 818 |                                                                                 $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char] + $utf_jamo_index[$utf_seq[$k]];
 | 
|---|
| 819 |                                                                                 ++$k;
 | 
|---|
| 820 |                                                                         }
 | 
|---|
| 821 |                                                                         else
 | 
|---|
| 822 |                                                                         {
 | 
|---|
| 823 |                                                                                 // L+V jamos, combine to a LV Hangul syllable
 | 
|---|
| 824 |                                                                                 $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char];
 | 
|---|
| 825 |                                                                         }
 | 
|---|
| 826 | 
 | 
|---|
| 827 |                                                                         $starter = chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
 | 
|---|
| 828 |                                                                 }
 | 
|---|
| 829 |                                                                 else
 | 
|---|
| 830 |                                                                 {
 | 
|---|
| 831 |                                                                         // Non-composable jamo, just add it to the sequence
 | 
|---|
| 832 |                                                                         $nf_seq .= $starter;
 | 
|---|
| 833 |                                                                         $starter = $utf_char;
 | 
|---|
| 834 |                                                                 }
 | 
|---|
| 835 |                                                         }
 | 
|---|
| 836 |                                                         else
 | 
|---|
| 837 |                                                         {
 | 
|---|
| 838 |                                                                 // No composite, just add the first starter to the sequence then continue with the other one
 | 
|---|
| 839 |                                                                 $nf_seq .= $starter;
 | 
|---|
| 840 |                                                                 $starter = $utf_char;
 | 
|---|
| 841 |                                                         }
 | 
|---|
| 842 |                                                 }
 | 
|---|
| 843 |                                                 else
 | 
|---|
| 844 |                                                 {
 | 
|---|
| 845 |                                                         ksort($utf_sort);
 | 
|---|
| 846 | 
 | 
|---|
| 847 |                                                         // For each class of combining characters
 | 
|---|
| 848 |                                                         foreach ($utf_sort as $cc => $utf_chars)
 | 
|---|
| 849 |                                                         {
 | 
|---|
| 850 |                                                                 $j = 0;
 | 
|---|
| 851 | 
 | 
|---|
| 852 |                                                                 do
 | 
|---|
| 853 |                                                                 {
 | 
|---|
| 854 |                                                                         // Look for a composite
 | 
|---|
| 855 |                                                                         if (isset($utf_canonical_comp[$starter . $utf_chars[$j]]))
 | 
|---|
| 856 |                                                                         {
 | 
|---|
| 857 |                                                                                 // Found a composite, replace the starter
 | 
|---|
| 858 |                                                                                 $starter = $utf_canonical_comp[$starter . $utf_chars[$j]];
 | 
|---|
| 859 |                                                                                 unset($utf_sort[$cc][$j]);
 | 
|---|
| 860 |                                                                         }
 | 
|---|
| 861 |                                                                         else
 | 
|---|
| 862 |                                                                         {
 | 
|---|
| 863 |                                                                                 // No composite, all following characters in that class are blocked
 | 
|---|
| 864 |                                                                                 break;
 | 
|---|
| 865 |                                                                         }
 | 
|---|
| 866 |                                                                 }
 | 
|---|
| 867 |                                                                 while (isset($utf_sort[$cc][++$j]));
 | 
|---|
| 868 |                                                         }
 | 
|---|
| 869 | 
 | 
|---|
| 870 |                                                         // Add the starter to the normalized sequence, followed by non-starters in canonical order
 | 
|---|
| 871 |                                                         $nf_seq .= $starter;
 | 
|---|
| 872 | 
 | 
|---|
| 873 |                                                         foreach ($utf_sort as $utf_chars)
 | 
|---|
| 874 |                                                         {
 | 
|---|
| 875 |                                                                 if (!empty($utf_chars))
 | 
|---|
| 876 |                                                                 {
 | 
|---|
| 877 |                                                                         $nf_seq .= implode('', $utf_chars);
 | 
|---|
| 878 |                                                                 }
 | 
|---|
| 879 |                                                         }
 | 
|---|
| 880 | 
 | 
|---|
| 881 |                                                         // Reset the array and go on
 | 
|---|
| 882 |                                                         $utf_sort = array();
 | 
|---|
| 883 |                                                         $starter = $utf_char;
 | 
|---|
| 884 |                                                 }
 | 
|---|
| 885 |                                         }
 | 
|---|
| 886 |                                 }
 | 
|---|
| 887 |                                 while ($k <= $k_max);
 | 
|---|
| 888 | 
 | 
|---|
| 889 |                                 $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $nf_seq;
 | 
|---|
| 890 |                                 $tmp_pos = $pos;
 | 
|---|
| 891 |                         }
 | 
|---|
| 892 |                         else
 | 
|---|
| 893 |                         {
 | 
|---|
| 894 |                                 // Only an ASCII char can make the program get here
 | 
|---|
| 895 |                                 //
 | 
|---|
| 896 |                                 // First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().
 | 
|---|
| 897 |                                 //
 | 
|---|
| 898 |                                 // The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on
 | 
|---|
| 899 |                                 // multi-byte text (where the only ASCII chars are spaces and punctuation)
 | 
|---|
| 900 |                                 if (++$pos != $len)
 | 
|---|
| 901 |                                 {
 | 
|---|
| 902 |                                         if ($str[$pos] < "\x80")
 | 
|---|
| 903 |                                         {
 | 
|---|
| 904 |                                                 $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
 | 
|---|
| 905 |                                                 $buffer[++$i & 7] = $str[$pos - 1];
 | 
|---|
| 906 |                                         }
 | 
|---|
| 907 |                                         else
 | 
|---|
| 908 |                                         {
 | 
|---|
| 909 |                                                 $buffer[++$i & 7] = $c;
 | 
|---|
| 910 |                                         }
 | 
|---|
| 911 |                                 }
 | 
|---|
| 912 |                         }
 | 
|---|
| 913 |                 }
 | 
|---|
| 914 |                 while ($pos < $len);
 | 
|---|
| 915 | 
 | 
|---|
| 916 |                 // Now it is time to return the string
 | 
|---|
| 917 |                 if ($tmp_pos)
 | 
|---|
| 918 |                 {
 | 
|---|
| 919 |                         // If the $tmp_pos cursor is not at the beginning of the string then at least one character was not in normal form. Replace $str with the fixed version
 | 
|---|
| 920 |                         if ($tmp_pos == $len)
 | 
|---|
| 921 |                         {
 | 
|---|
| 922 |                                 // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
 | 
|---|
| 923 |                                 return $tmp;
 | 
|---|
| 924 |                         }
 | 
|---|
| 925 |                         else
 | 
|---|
| 926 |                         {
 | 
|---|
| 927 |                                 // The rightmost chunk of $str has not been appended to $tmp yet
 | 
|---|
| 928 |                                 return $tmp . substr($str, $tmp_pos);
 | 
|---|
| 929 |                         }
 | 
|---|
| 930 |                 }
 | 
|---|
| 931 | 
 | 
|---|
| 932 |                 // The string was already in normal form
 | 
|---|
| 933 |                 return $str;
 | 
|---|
| 934 |         }
 | 
|---|
| 935 | 
 | 
|---|
| 936 |         /**
 | 
|---|
| 937 |         * Decompose a UTF string
 | 
|---|
| 938 |         *
 | 
|---|
| 939 |         * @param        string  $str                    UTF string
 | 
|---|
| 940 |         * @param        integer $pos                    Position of the first UTF char (in bytes)
 | 
|---|
| 941 |         * @param        integer $len                    Length of the string (in bytes)
 | 
|---|
| 942 |         * @param        array   &$decomp_map    Decomposition mapping, passed by reference but never modified
 | 
|---|
| 943 |         * @return       string                                  The string, decomposed and sorted canonically
 | 
|---|
| 944 |         *
 | 
|---|
| 945 |         * @access       private
 | 
|---|
| 946 |         */
 | 
|---|
| 947 |         function decompose($str, $pos, $len, &$decomp_map)
 | 
|---|
| 948 |         {
 | 
|---|
| 949 |                 global $utf_combining_class;
 | 
|---|
| 950 | 
 | 
|---|
| 951 |                 // Load some commonly-used tables
 | 
|---|
| 952 |                 if (!isset($utf_combining_class))
 | 
|---|
| 953 |                 {
 | 
|---|
| 954 |                         global $phpbb_root_path, $phpEx;
 | 
|---|
| 955 |                         include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
 | 
|---|
| 956 |                 }
 | 
|---|
| 957 | 
 | 
|---|
| 958 |                 // UTF char length array
 | 
|---|
| 959 |                 $utf_len_mask = array(
 | 
|---|
| 960 |                         // Leading bytes masks
 | 
|---|
| 961 |                         "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
 | 
|---|
| 962 |                         // Trailing bytes masks
 | 
|---|
| 963 |                         "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
 | 
|---|
| 964 |                 );
 | 
|---|
| 965 | 
 | 
|---|
| 966 |                 // Some extra checks are triggered on the first byte of a UTF sequence
 | 
|---|
| 967 |                 $extra_check = array(
 | 
|---|
| 968 |                         "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
 | 
|---|
| 969 |                         "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
 | 
|---|
| 970 |                         "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
 | 
|---|
| 971 |                 );
 | 
|---|
| 972 | 
 | 
|---|
| 973 |                 // These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge:
 | 
|---|
| 974 |                 //   - 2-byte: 110? ???? 10?? ????
 | 
|---|
| 975 |                 //   - 3-byte: 1110 ???? 10?? ???? 10?? ????
 | 
|---|
| 976 |                 //   - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????
 | 
|---|
| 977 |                 // Note that 5- and 6- byte sequences are automatically discarded
 | 
|---|
| 978 |                 $utf_validation_mask = array(
 | 
|---|
| 979 |                         2       => "\xE0\xC0",
 | 
|---|
| 980 |                         3       => "\xF0\xC0\xC0",
 | 
|---|
| 981 |                         4       => "\xF8\xC0\xC0\xC0"
 | 
|---|
| 982 |                 );
 | 
|---|
| 983 | 
 | 
|---|
| 984 |                 $utf_validation_check = array(
 | 
|---|
| 985 |                         2       => "\xC0\x80",
 | 
|---|
| 986 |                         3       => "\xE0\x80\x80",
 | 
|---|
| 987 |                         4       => "\xF0\x80\x80\x80"
 | 
|---|
| 988 |                 );
 | 
|---|
| 989 | 
 | 
|---|
| 990 |                 $tmp = '';
 | 
|---|
| 991 |                 $starter_pos = $pos;
 | 
|---|
| 992 |                 $tmp_pos = $last_cc = $sort = $dump = 0;
 | 
|---|
| 993 |                 $utf_sort = array();
 | 
|---|
| 994 | 
 | 
|---|
| 995 | 
 | 
|---|
| 996 |                 // Main loop
 | 
|---|
| 997 |                 do
 | 
|---|
| 998 |                 {
 | 
|---|
| 999 |                         // STEP 0: Capture the current char
 | 
|---|
| 1000 | 
 | 
|---|
| 1001 |                         $cur_mask = $str[$pos] & "\xF0";
 | 
|---|
| 1002 |                         if (isset($utf_len_mask[$cur_mask]))
 | 
|---|
| 1003 |                         {
 | 
|---|
| 1004 |                                 if ($utf_len = $utf_len_mask[$cur_mask])
 | 
|---|
| 1005 |                                 {
 | 
|---|
| 1006 |                                         // Multibyte char
 | 
|---|
| 1007 |                                         $utf_char = substr($str, $pos, $utf_len);
 | 
|---|
| 1008 |                                         $pos += $utf_len;
 | 
|---|
| 1009 |                                 }
 | 
|---|
| 1010 |                                 else
 | 
|---|
| 1011 |                                 {
 | 
|---|
| 1012 |                                         // A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode
 | 
|---|
| 1013 |                                         // replacement char and we will advance the cursor
 | 
|---|
| 1014 |                                         $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
 | 
|---|
| 1015 | 
 | 
|---|
| 1016 |                                         if ($dump)
 | 
|---|
| 1017 |                                         {
 | 
|---|
| 1018 |                                                 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
 | 
|---|
| 1019 | 
 | 
|---|
| 1020 |                                                 // Dump combiners
 | 
|---|
| 1021 |                                                 if (!empty($utf_sort))
 | 
|---|
| 1022 |                                                 {
 | 
|---|
| 1023 |                                                         if ($sort)
 | 
|---|
| 1024 |                                                         {
 | 
|---|
| 1025 |                                                                 ksort($utf_sort);
 | 
|---|
| 1026 |                                                         }
 | 
|---|
| 1027 | 
 | 
|---|
| 1028 |                                                         foreach ($utf_sort as $utf_chars)
 | 
|---|
| 1029 |                                                         {
 | 
|---|
| 1030 |                                                                 $tmp .= implode('', $utf_chars);
 | 
|---|
| 1031 |                                                         }
 | 
|---|
| 1032 |                                                 }
 | 
|---|
| 1033 | 
 | 
|---|
| 1034 |                                                 $tmp .= str_repeat(UTF8_REPLACEMENT, $spn);
 | 
|---|
| 1035 |                                                 $dump = $sort = 0;
 | 
|---|
| 1036 |                                         }
 | 
|---|
| 1037 |                                         else
 | 
|---|
| 1038 |                                         {
 | 
|---|
| 1039 |                                                 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
 | 
|---|
| 1040 |                                         }
 | 
|---|
| 1041 | 
 | 
|---|
| 1042 |                                         $pos += $spn;
 | 
|---|
| 1043 |                                         $tmp_pos = $starter_pos = $pos;
 | 
|---|
| 1044 | 
 | 
|---|
| 1045 |                                         $utf_sort = array();
 | 
|---|
| 1046 |                                         $last_cc = 0;
 | 
|---|
| 1047 | 
 | 
|---|
| 1048 |                                         continue;
 | 
|---|
| 1049 |                                 }
 | 
|---|
| 1050 | 
 | 
|---|
| 1051 | 
 | 
|---|
| 1052 |                                 // STEP 1: Decide what to do with current char
 | 
|---|
| 1053 | 
 | 
|---|
| 1054 |                                 // Now, in that order:
 | 
|---|
| 1055 |                                 //  - check if that character is decomposable
 | 
|---|
| 1056 |                                 //  - check if that character is a non-starter
 | 
|---|
| 1057 |                                 //  - check if that character requires extra checks to be performed
 | 
|---|
| 1058 |                                 if (isset($decomp_map[$utf_char]))
 | 
|---|
| 1059 |                                 {
 | 
|---|
| 1060 |                                         // Decompose the char
 | 
|---|
| 1061 |                                         $_pos = 0;
 | 
|---|
| 1062 |                                         $_len = strlen($decomp_map[$utf_char]);
 | 
|---|
| 1063 | 
 | 
|---|
| 1064 |                                         do
 | 
|---|
| 1065 |                                         {
 | 
|---|
| 1066 |                                                 $c = $decomp_map[$utf_char][$_pos];
 | 
|---|
| 1067 |                                                 $_utf_len =& $utf_len_mask[$c & "\xF0"];
 | 
|---|
| 1068 | 
 | 
|---|
| 1069 |                                                 if (isset($_utf_len))
 | 
|---|
| 1070 |                                                 {
 | 
|---|
| 1071 |                                                         $_utf_char = substr($decomp_map[$utf_char], $_pos, $_utf_len);
 | 
|---|
| 1072 |                                                         $_pos += $_utf_len;
 | 
|---|
| 1073 | 
 | 
|---|
| 1074 |                                                         if (isset($utf_combining_class[$_utf_char]))
 | 
|---|
| 1075 |                                                         {
 | 
|---|
| 1076 |                                                                 // The character decomposed to a non-starter, buffer it for sorting
 | 
|---|
| 1077 |                                                                 $utf_sort[$utf_combining_class[$_utf_char]][] = $_utf_char;
 | 
|---|
| 1078 | 
 | 
|---|
| 1079 |                                                                 if ($utf_combining_class[$_utf_char] < $last_cc)
 | 
|---|
| 1080 |                                                                 {
 | 
|---|
| 1081 |                                                                         // Not canonically ordered, will require sorting
 | 
|---|
| 1082 |                                                                         $sort = $dump = 1;
 | 
|---|
| 1083 |                                                                 }
 | 
|---|
| 1084 |                                                                 else
 | 
|---|
| 1085 |                                                                 {
 | 
|---|
| 1086 |                                                                         $dump = 1;
 | 
|---|
| 1087 |                                                                         $last_cc = $utf_combining_class[$_utf_char];
 | 
|---|
| 1088 |                                                                 }
 | 
|---|
| 1089 |                                                         }
 | 
|---|
| 1090 |                                                         else
 | 
|---|
| 1091 |                                                         {
 | 
|---|
| 1092 |                                                                 // This character decomposition contains a starter, dump the buffer and continue
 | 
|---|
| 1093 |                                                                 if ($dump)
 | 
|---|
| 1094 |                                                                 {
 | 
|---|
| 1095 |                                                                         $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
 | 
|---|
| 1096 | 
 | 
|---|
| 1097 |                                                                         // Dump combiners
 | 
|---|
| 1098 |                                                                         if (!empty($utf_sort))
 | 
|---|
| 1099 |                                                                         {
 | 
|---|
| 1100 |                                                                                 if ($sort)
 | 
|---|
| 1101 |                                                                                 {
 | 
|---|
| 1102 |                                                                                         ksort($utf_sort);
 | 
|---|
| 1103 |                                                                                 }
 | 
|---|
| 1104 | 
 | 
|---|
| 1105 |                                                                                 foreach ($utf_sort as $utf_chars)
 | 
|---|
| 1106 |                                                                                 {
 | 
|---|
| 1107 |                                                                                         $tmp .= implode('', $utf_chars);
 | 
|---|
| 1108 |                                                                                 }
 | 
|---|
| 1109 |                                                                         }
 | 
|---|
| 1110 | 
 | 
|---|
| 1111 |                                                                         $tmp .= $_utf_char;
 | 
|---|
| 1112 |                                                                         $dump = $sort = 0;
 | 
|---|
| 1113 |                                                                 }
 | 
|---|
| 1114 |                                                                 else
 | 
|---|
| 1115 |                                                                 {
 | 
|---|
| 1116 |                                                                         $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos) . $_utf_char;
 | 
|---|
| 1117 |                                                                 }
 | 
|---|
| 1118 | 
 | 
|---|
| 1119 |                                                                 $tmp_pos = $starter_pos = $pos;
 | 
|---|
| 1120 |                                                                 $utf_sort = array();
 | 
|---|
| 1121 |                                                                 $last_cc = 0;
 | 
|---|
| 1122 |                                                         }
 | 
|---|
| 1123 |                                                 }
 | 
|---|
| 1124 |                                                 else
 | 
|---|
| 1125 |                                                 {
 | 
|---|
| 1126 |                                                         // This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue
 | 
|---|
| 1127 |                                                         ++$_pos;
 | 
|---|
| 1128 | 
 | 
|---|
| 1129 |                                                         if ($dump)
 | 
|---|
| 1130 |                                                         {
 | 
|---|
| 1131 |                                                                 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
 | 
|---|
| 1132 | 
 | 
|---|
| 1133 |                                                                 // Dump combiners
 | 
|---|
| 1134 |                                                                 if (!empty($utf_sort))
 | 
|---|
| 1135 |                                                                 {
 | 
|---|
| 1136 |                                                                         if ($sort)
 | 
|---|
| 1137 |                                                                         {
 | 
|---|
| 1138 |                                                                                 ksort($utf_sort);
 | 
|---|
| 1139 |                                                                         }
 | 
|---|
| 1140 | 
 | 
|---|
| 1141 |                                                                         foreach ($utf_sort as $utf_chars)
 | 
|---|
| 1142 |                                                                         {
 | 
|---|
| 1143 |                                                                                 $tmp .= implode('', $utf_chars);
 | 
|---|
| 1144 |                                                                         }
 | 
|---|
| 1145 |                                                                 }
 | 
|---|
| 1146 | 
 | 
|---|
| 1147 |                                                                 $tmp .= $c;
 | 
|---|
| 1148 |                                                                 $dump = $sort = 0;
 | 
|---|
| 1149 |                                                         }
 | 
|---|
| 1150 |                                                         else
 | 
|---|
| 1151 |                                                         {
 | 
|---|
| 1152 |                                                                 $tmp .= substr($str, $tmp_pos, $pos - $utf_len - $tmp_pos) . $c;
 | 
|---|
| 1153 |                                                         }
 | 
|---|
| 1154 | 
 | 
|---|
| 1155 |                                                         $tmp_pos = $starter_pos = $pos;
 | 
|---|
| 1156 |                                                         $utf_sort = array();
 | 
|---|
| 1157 |                                                         $last_cc = 0;
 | 
|---|
| 1158 |                                                 }
 | 
|---|
| 1159 |                                         }
 | 
|---|
| 1160 |                                         while ($_pos < $_len);
 | 
|---|
| 1161 |                                 }
 | 
|---|
| 1162 |                                 else if (isset($utf_combining_class[$utf_char]))
 | 
|---|
| 1163 |                                 {
 | 
|---|
| 1164 |                                         // Combining character
 | 
|---|
| 1165 |                                         if ($utf_combining_class[$utf_char] < $last_cc)
 | 
|---|
| 1166 |                                         {
 | 
|---|
| 1167 |                                                 // Not in canonical order
 | 
|---|
| 1168 |                                                 $sort = $dump = 1;
 | 
|---|
| 1169 |                                         }
 | 
|---|
| 1170 |                                         else
 | 
|---|
| 1171 |                                         {
 | 
|---|
| 1172 |                                                 $last_cc = $utf_combining_class[$utf_char];
 | 
|---|
| 1173 |                                         }
 | 
|---|
| 1174 | 
 | 
|---|
| 1175 |                                         $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
 | 
|---|
| 1176 |                                 }
 | 
|---|
| 1177 |                                 else
 | 
|---|
| 1178 |                                 {
 | 
|---|
| 1179 |                                         // Non-decomposable starter, check out if it's a Hangul syllable
 | 
|---|
| 1180 |                                         if ($utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST)
 | 
|---|
| 1181 |                                         {
 | 
|---|
| 1182 |                                                 // Nope, regular UTF char, check that we have the correct number of trailing bytes
 | 
|---|
| 1183 |                                                 if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
 | 
|---|
| 1184 |                                                 {
 | 
|---|
| 1185 |                                                         // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
 | 
|---|
| 1186 |                                                         // has been encoded in a five- or six- byte sequence.
 | 
|---|
| 1187 |                                                         // Move the cursor back to its original position then advance it to the position it should really be at
 | 
|---|
| 1188 |                                                         $pos -= $utf_len;
 | 
|---|
| 1189 |                                                         $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
 | 
|---|
| 1190 | 
 | 
|---|
| 1191 |                                                         if (!empty($utf_sort))
 | 
|---|
| 1192 |                                                         {
 | 
|---|
| 1193 |                                                                 ksort($utf_sort);
 | 
|---|
| 1194 | 
 | 
|---|
| 1195 |                                                                 foreach ($utf_sort as $utf_chars)
 | 
|---|
| 1196 |                                                                 {
 | 
|---|
| 1197 |                                                                         $tmp .= implode('', $utf_chars);
 | 
|---|
| 1198 |                                                                 }
 | 
|---|
| 1199 |                                                                 $utf_sort = array();
 | 
|---|
| 1200 |                                                         }
 | 
|---|
| 1201 | 
 | 
|---|
| 1202 |                                                         // Add a replacement char then another replacement char for every trailing byte.
 | 
|---|
| 1203 |                                                         //
 | 
|---|
| 1204 |                                                         // @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this
 | 
|---|
| 1205 |                                                         $spn = strspn($str, UTF8_TRAILING_BYTES, ++$pos);
 | 
|---|
| 1206 |                                                         $tmp .= str_repeat(UTF8_REPLACEMENT, $spn + 1);
 | 
|---|
| 1207 | 
 | 
|---|
| 1208 |                                                         $dump = $sort = 0;
 | 
|---|
| 1209 | 
 | 
|---|
| 1210 |                                                         $pos += $spn;
 | 
|---|
| 1211 |                                                         $tmp_pos = $pos;
 | 
|---|
| 1212 |                                                         continue;
 | 
|---|
| 1213 |                                                 }
 | 
|---|
| 1214 | 
 | 
|---|
| 1215 |                                                 if (isset($extra_check[$utf_char[0]]))
 | 
|---|
| 1216 |                                                 {
 | 
|---|
| 1217 |                                                         switch ($utf_char[0])
 | 
|---|
| 1218 |                                                         {
 | 
|---|
| 1219 |                                                                 // Note: 0xED is quite common in Korean
 | 
|---|
| 1220 |                                                                 case "\xED":
 | 
|---|
| 1221 |                                                                         if ($utf_char >= "\xED\xA0\x80")
 | 
|---|
| 1222 |                                                                         {
 | 
|---|
| 1223 |                                                                                 // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
 | 
|---|
| 1224 |                                                                                 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
 | 
|---|
| 1225 | 
 | 
|---|
| 1226 |                                                                                 if (!empty($utf_sort))
 | 
|---|
| 1227 |                                                                                 {
 | 
|---|
| 1228 |                                                                                         ksort($utf_sort);
 | 
|---|
| 1229 | 
 | 
|---|
| 1230 |                                                                                         foreach ($utf_sort as $utf_chars)
 | 
|---|
| 1231 |                                                                                         {
 | 
|---|
| 1232 |                                                                                                 $tmp .= implode('', $utf_chars);
 | 
|---|
| 1233 |                                                                                         }
 | 
|---|
| 1234 |                                                                                         $utf_sort = array();
 | 
|---|
| 1235 |                                                                                 }
 | 
|---|
| 1236 | 
 | 
|---|
| 1237 |                                                                                 $tmp .= UTF8_REPLACEMENT;
 | 
|---|
| 1238 |                                                                                 $dump = $sort = 0;
 | 
|---|
| 1239 | 
 | 
|---|
| 1240 |                                                                                 $tmp_pos = $starter_pos = $pos;
 | 
|---|
| 1241 |                                                                                 continue 2;
 | 
|---|
| 1242 |                                                                         }
 | 
|---|
| 1243 |                                                                 break;
 | 
|---|
| 1244 | 
 | 
|---|
| 1245 |                                                                 // Note: 0xEF is quite common in Japanese
 | 
|---|
| 1246 |                                                                 case "\xEF":
 | 
|---|
| 1247 |                                                                         if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
 | 
|---|
| 1248 |                                                                         {
 | 
|---|
| 1249 |                                                                                 // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
 | 
|---|
| 1250 |                                                                                 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
 | 
|---|
| 1251 | 
 | 
|---|
| 1252 |                                                                                 if (!empty($utf_sort))
 | 
|---|
| 1253 |                                                                                 {
 | 
|---|
| 1254 |                                                                                         ksort($utf_sort);
 | 
|---|
| 1255 | 
 | 
|---|
| 1256 |                                                                                         foreach ($utf_sort as $utf_chars)
 | 
|---|
| 1257 |                                                                                         {
 | 
|---|
| 1258 |                                                                                                 $tmp .= implode('', $utf_chars);
 | 
|---|
| 1259 |                                                                                         }
 | 
|---|
| 1260 |                                                                                         $utf_sort = array();
 | 
|---|
| 1261 |                                                                                 }
 | 
|---|
| 1262 | 
 | 
|---|
| 1263 |                                                                                 $tmp .= UTF8_REPLACEMENT;
 | 
|---|
| 1264 |                                                                                 $dump = $sort = 0;
 | 
|---|
| 1265 | 
 | 
|---|
| 1266 |                                                                                 $tmp_pos = $starter_pos = $pos;
 | 
|---|
| 1267 |                                                                                 continue 2;
 | 
|---|
| 1268 |                                                                         }
 | 
|---|
| 1269 |                                                                 break;
 | 
|---|
| 1270 | 
 | 
|---|
| 1271 |                                                                 case "\xC0":
 | 
|---|
| 1272 |                                                                 case "\xC1":
 | 
|---|
| 1273 |                                                                         if ($utf_char <= "\xC1\xBF")
 | 
|---|
| 1274 |                                                                         {
 | 
|---|
| 1275 |                                                                                 // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
 | 
|---|
| 1276 |                                                                                 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
 | 
|---|
| 1277 | 
 | 
|---|
| 1278 |                                                                                 if (!empty($utf_sort))
 | 
|---|
| 1279 |                                                                                 {
 | 
|---|
| 1280 |                                                                                         ksort($utf_sort);
 | 
|---|
| 1281 | 
 | 
|---|
| 1282 |                                                                                         foreach ($utf_sort as $utf_chars)
 | 
|---|
| 1283 |                                                                                         {
 | 
|---|
| 1284 |                                                                                                 $tmp .= implode('', $utf_chars);
 | 
|---|
| 1285 |                                                                                         }
 | 
|---|
| 1286 |                                                                                         $utf_sort = array();
 | 
|---|
| 1287 |                                                                                 }
 | 
|---|
| 1288 | 
 | 
|---|
| 1289 |                                                                                 $tmp .= UTF8_REPLACEMENT;
 | 
|---|
| 1290 |                                                                                 $dump = $sort = 0;
 | 
|---|
| 1291 | 
 | 
|---|
| 1292 |                                                                                 $tmp_pos = $starter_pos = $pos;
 | 
|---|
| 1293 |                                                                                 continue 2;
 | 
|---|
| 1294 |                                                                         }
 | 
|---|
| 1295 |                                                                 break;
 | 
|---|
| 1296 | 
 | 
|---|
| 1297 |                                                                 case "\xE0":
 | 
|---|
| 1298 |                                                                         if ($utf_char <= "\xE0\x9F\xBF")
 | 
|---|
| 1299 |                                                                         {
 | 
|---|
| 1300 |                                                                                 // Unicode char U+0000..U+07FF encoded in 3 bytes
 | 
|---|
| 1301 |                                                                                 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
 | 
|---|
| 1302 | 
 | 
|---|
| 1303 |                                                                                 if (!empty($utf_sort))
 | 
|---|
| 1304 |                                                                                 {
 | 
|---|
| 1305 |                                                                                         ksort($utf_sort);
 | 
|---|
| 1306 | 
 | 
|---|
| 1307 |                                                                                         foreach ($utf_sort as $utf_chars)
 | 
|---|
| 1308 |                                                                                         {
 | 
|---|
| 1309 |                                                                                                 $tmp .= implode('', $utf_chars);
 | 
|---|
| 1310 |                                                                                         }
 | 
|---|
| 1311 |                                                                                         $utf_sort = array();
 | 
|---|
| 1312 |                                                                                 }
 | 
|---|
| 1313 | 
 | 
|---|
| 1314 |                                                                                 $tmp .= UTF8_REPLACEMENT;
 | 
|---|
| 1315 |                                                                                 $dump = $sort = 0;
 | 
|---|
| 1316 | 
 | 
|---|
| 1317 |                                                                                 $tmp_pos = $starter_pos = $pos;
 | 
|---|
| 1318 |                                                                                 continue 2;
 | 
|---|
| 1319 |                                                                         }
 | 
|---|
| 1320 |                                                                 break;
 | 
|---|
| 1321 | 
 | 
|---|
| 1322 |                                                                 case "\xF0":
 | 
|---|
| 1323 |                                                                         if ($utf_char <= "\xF0\x8F\xBF\xBF")
 | 
|---|
| 1324 |                                                                         {
 | 
|---|
| 1325 |                                                                                 // Unicode char U+0000..U+FFFF encoded in 4 bytes
 | 
|---|
| 1326 |                                                                                 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
 | 
|---|
| 1327 | 
 | 
|---|
| 1328 |                                                                                 if (!empty($utf_sort))
 | 
|---|
| 1329 |                                                                                 {
 | 
|---|
| 1330 |                                                                                         ksort($utf_sort);
 | 
|---|
| 1331 | 
 | 
|---|
| 1332 |                                                                                         foreach ($utf_sort as $utf_chars)
 | 
|---|
| 1333 |                                                                                         {
 | 
|---|
| 1334 |                                                                                                 $tmp .= implode('', $utf_chars);
 | 
|---|
| 1335 |                                                                                         }
 | 
|---|
| 1336 |                                                                                         $utf_sort = array();
 | 
|---|
| 1337 |                                                                                 }
 | 
|---|
| 1338 | 
 | 
|---|
| 1339 |                                                                                 $tmp .= UTF8_REPLACEMENT;
 | 
|---|
| 1340 |                                                                                 $dump = $sort = 0;
 | 
|---|
| 1341 | 
 | 
|---|
| 1342 |                                                                                 $tmp_pos = $starter_pos = $pos;
 | 
|---|
| 1343 |                                                                                 continue 2;
 | 
|---|
| 1344 |                                                                         }
 | 
|---|
| 1345 |                                                                 break;
 | 
|---|
| 1346 | 
 | 
|---|
| 1347 |                                                                 default:
 | 
|---|
| 1348 |                                                                         if ($utf_char > UTF8_MAX)
 | 
|---|
| 1349 |                                                                         {
 | 
|---|
| 1350 |                                                                                 // Out of the Unicode range
 | 
|---|
| 1351 |                                                                                 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
 | 
|---|
| 1352 | 
 | 
|---|
| 1353 |                                                                                 if (!empty($utf_sort))
 | 
|---|
| 1354 |                                                                                 {
 | 
|---|
| 1355 |                                                                                         ksort($utf_sort);
 | 
|---|
| 1356 | 
 | 
|---|
| 1357 |                                                                                         foreach ($utf_sort as $utf_chars)
 | 
|---|
| 1358 |                                                                                         {
 | 
|---|
| 1359 |                                                                                                 $tmp .= implode('', $utf_chars);
 | 
|---|
| 1360 |                                                                                         }
 | 
|---|
| 1361 |                                                                                         $utf_sort = array();
 | 
|---|
| 1362 |                                                                                 }
 | 
|---|
| 1363 | 
 | 
|---|
| 1364 |                                                                                 $tmp .= UTF8_REPLACEMENT;
 | 
|---|
| 1365 |                                                                                 $dump = $sort = 0;
 | 
|---|
| 1366 | 
 | 
|---|
| 1367 |                                                                                 $tmp_pos = $starter_pos = $pos;
 | 
|---|
| 1368 |                                                                                 continue 2;
 | 
|---|
| 1369 |                                                                         }
 | 
|---|
| 1370 |                                                                 break;
 | 
|---|
| 1371 |                                                         }
 | 
|---|
| 1372 |                                                 }
 | 
|---|
| 1373 |                                         }
 | 
|---|
| 1374 |                                         else
 | 
|---|
| 1375 |                                         {
 | 
|---|
| 1376 |                                                 // Hangul syllable
 | 
|---|
| 1377 |                                                 $idx = (((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE;
 | 
|---|
| 1378 | 
 | 
|---|
| 1379 |                                                 // LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).
 | 
|---|
| 1380 |                                                 //
 | 
|---|
| 1381 |                                                 // The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte
 | 
|---|
| 1382 |                                                 if ($t_index = $idx % UNICODE_HANGUL_TCOUNT)
 | 
|---|
| 1383 |                                                 {
 | 
|---|
| 1384 |                                                         if ($t_index < 25)
 | 
|---|
| 1385 |                                                         {
 | 
|---|
| 1386 |                                                                 $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";
 | 
|---|
| 1387 |                                                                 $utf_char[8] = chr(0xA7 + $t_index);
 | 
|---|
| 1388 |                                                         }
 | 
|---|
| 1389 |                                                         else
 | 
|---|
| 1390 |                                                         {
 | 
|---|
| 1391 |                                                                 $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";
 | 
|---|
| 1392 |                                                                 $utf_char[8] = chr(0x67 + $t_index);
 | 
|---|
| 1393 |                                                         }
 | 
|---|
| 1394 |                                                 }
 | 
|---|
| 1395 |                                                 else
 | 
|---|
| 1396 |                                                 {
 | 
|---|
| 1397 |                                                         $utf_char = "\xE1\x84\x00\xE1\x85\x00";
 | 
|---|
| 1398 |                                                 }
 | 
|---|
| 1399 | 
 | 
|---|
| 1400 |                                                 $utf_char[2] = chr(0x80 + (int) ($idx / UNICODE_HANGUL_NCOUNT));
 | 
|---|
| 1401 |                                                 $utf_char[5] = chr(0xA1 + (int) (($idx % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT));
 | 
|---|
| 1402 | 
 | 
|---|
| 1403 |                                                 // Just like other decompositions, the resulting Jamos must be dumped to the tmp string
 | 
|---|
| 1404 |                                                 $dump = 1;
 | 
|---|
| 1405 |                                         }
 | 
|---|
| 1406 | 
 | 
|---|
| 1407 |                                         // Do we need to dump stuff to the tmp string?
 | 
|---|
| 1408 |                                         if ($dump)
 | 
|---|
| 1409 |                                         {
 | 
|---|
| 1410 |                                                 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
 | 
|---|
| 1411 | 
 | 
|---|
| 1412 |                                                 // Dump combiners
 | 
|---|
| 1413 |                                                 if (!empty($utf_sort))
 | 
|---|
| 1414 |                                                 {
 | 
|---|
| 1415 |                                                         if ($sort)
 | 
|---|
| 1416 |                                                         {
 | 
|---|
| 1417 |                                                                 ksort($utf_sort);
 | 
|---|
| 1418 |                                                         }
 | 
|---|
| 1419 | 
 | 
|---|
| 1420 |                                                         foreach ($utf_sort as $utf_chars)
 | 
|---|
| 1421 |                                                         {
 | 
|---|
| 1422 |                                                                 $tmp .= implode('', $utf_chars);
 | 
|---|
| 1423 |                                                         }
 | 
|---|
| 1424 |                                                 }
 | 
|---|
| 1425 | 
 | 
|---|
| 1426 |                                                 $tmp .= $utf_char;
 | 
|---|
| 1427 |                                                 $dump = $sort = 0;
 | 
|---|
| 1428 |                                                 $tmp_pos = $pos;
 | 
|---|
| 1429 |                                         }
 | 
|---|
| 1430 | 
 | 
|---|
| 1431 |                                         $last_cc = 0;
 | 
|---|
| 1432 |                                         $utf_sort = array();
 | 
|---|
| 1433 |                                         $starter_pos = $pos;
 | 
|---|
| 1434 |                                 }
 | 
|---|
| 1435 |                         }
 | 
|---|
| 1436 |                         else
 | 
|---|
| 1437 |                         {
 | 
|---|
| 1438 |                                 // ASCII char, which happens to be a starter (as any other ASCII char)
 | 
|---|
| 1439 |                                 if ($dump)
 | 
|---|
| 1440 |                                 {
 | 
|---|
| 1441 |                                         $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
 | 
|---|
| 1442 | 
 | 
|---|
| 1443 |                                         // Dump combiners
 | 
|---|
| 1444 |                                         if (!empty($utf_sort))
 | 
|---|
| 1445 |                                         {
 | 
|---|
| 1446 |                                                 if ($sort)
 | 
|---|
| 1447 |                                                 {
 | 
|---|
| 1448 |                                                         ksort($utf_sort);
 | 
|---|
| 1449 |                                                 }
 | 
|---|
| 1450 | 
 | 
|---|
| 1451 |                                                 foreach ($utf_sort as $utf_chars)
 | 
|---|
| 1452 |                                                 {
 | 
|---|
| 1453 |                                                         $tmp .= implode('', $utf_chars);
 | 
|---|
| 1454 |                                                 }
 | 
|---|
| 1455 |                                         }
 | 
|---|
| 1456 | 
 | 
|---|
| 1457 |                                         $tmp .= $str[$pos];
 | 
|---|
| 1458 |                                         $dump = $sort = 0;
 | 
|---|
| 1459 |                                         $tmp_pos = ++$pos;
 | 
|---|
| 1460 | 
 | 
|---|
| 1461 |                                         $pos += strspn($str, UTF8_ASCII_RANGE, $pos);
 | 
|---|
| 1462 |                                 }
 | 
|---|
| 1463 |                                 else
 | 
|---|
| 1464 |                                 {
 | 
|---|
| 1465 |                                         $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
 | 
|---|
| 1466 |                                 }
 | 
|---|
| 1467 | 
 | 
|---|
| 1468 |                                 $last_cc = 0;
 | 
|---|
| 1469 |                                 $utf_sort = array();
 | 
|---|
| 1470 |                                 $starter_pos = $pos;
 | 
|---|
| 1471 |                         }
 | 
|---|
| 1472 |                 }
 | 
|---|
| 1473 |                 while ($pos < $len);
 | 
|---|
| 1474 | 
 | 
|---|
| 1475 |                 // Now it is time to return the string
 | 
|---|
| 1476 |                 if ($dump)
 | 
|---|
| 1477 |                 {
 | 
|---|
| 1478 |                         $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
 | 
|---|
| 1479 | 
 | 
|---|
| 1480 |                         // Dump combiners
 | 
|---|
| 1481 |                         if (!empty($utf_sort))
 | 
|---|
| 1482 |                         {
 | 
|---|
| 1483 |                                 if ($sort)
 | 
|---|
| 1484 |                                 {
 | 
|---|
| 1485 |                                         ksort($utf_sort);
 | 
|---|
| 1486 |                                 }
 | 
|---|
| 1487 | 
 | 
|---|
| 1488 |                                 foreach ($utf_sort as $utf_chars)
 | 
|---|
| 1489 |                                 {
 | 
|---|
| 1490 |                                         $tmp .= implode('', $utf_chars);
 | 
|---|
| 1491 |                                 }
 | 
|---|
| 1492 |                         }
 | 
|---|
| 1493 | 
 | 
|---|
| 1494 |                         return $tmp;
 | 
|---|
| 1495 |                 }
 | 
|---|
| 1496 |                 else if ($tmp_pos)
 | 
|---|
| 1497 |                 {
 | 
|---|
| 1498 |                         // If the $tmp_pos cursor was moved then at least one character was not in normal form. Replace $str with the fixed version
 | 
|---|
| 1499 |                         if ($tmp_pos == $len)
 | 
|---|
| 1500 |                         {
 | 
|---|
| 1501 |                                 // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
 | 
|---|
| 1502 |                                 return $tmp;
 | 
|---|
| 1503 |                         }
 | 
|---|
| 1504 |                         else
 | 
|---|
| 1505 |                         {
 | 
|---|
| 1506 |                                 // The rightmost chunk of $str has not been appended to $tmp yet
 | 
|---|
| 1507 |                                 return $tmp . substr($str, $tmp_pos);
 | 
|---|
| 1508 |                         }
 | 
|---|
| 1509 |                 }
 | 
|---|
| 1510 | 
 | 
|---|
| 1511 |                 // The string was already in normal form
 | 
|---|
| 1512 |                 return $str;
 | 
|---|
| 1513 |         }
 | 
|---|
| 1514 | }
 | 
|---|
| 1515 | 
 | 
|---|
| 1516 | ?>
 | 
|---|