[ Index ] |
PHP Cross Reference of phpBB 3.0 Beta 3 |
[Summary view] [Print] [Text view]
<?php
/**
*
* @package phpBB3
* @version $Id: utf_normalizer.php,v 1.6 2006/11/03 23:09:15 davidmj Exp $
* @copyright (c) 2005 phpBB Group
* @license http://opensource.org/licenses/gpl-license.php GNU Public License
*
*/

/**
* Some Unicode characters encoded in UTF-8
*
* Preserved for compatibility
*/
define('UTF8_REPLACEMENT', "\xEF\xBF\xBD");			// U+FFFD
define('UTF8_MAX', "\xF4\x8F\xBF\xBF");				// U+10FFFF
define('UTF8_FFFE', "\xEF\xBF\xBE");				// U+FFFE
define('UTF8_FFFF', "\xEF\xBF\xBF");				// U+FFFF
define('UTF8_SURROGATE_FIRST', "\xED\xA0\x80");		// U+D800
define('UTF8_SURROGATE_LAST', "\xED\xBF\xBF");		// U+DFFF
define('UTF8_HANGUL_FIRST', "\xEA\xB0\x80");		// U+AC00
define('UTF8_HANGUL_LAST', "\xED\x9E\xA3");			// U+D7A3

define('UTF8_CJK_FIRST', "\xE4\xB8\x80");			// U+4E00
define('UTF8_CJK_LAST', "\xE9\xBE\xBB");			// U+9FBB
define('UTF8_CJK_B_FIRST', "\xF0\xA0\x80\x80");		// U+20000
define('UTF8_CJK_B_LAST', "\xF0\xAA\x9B\x96");		// U+2A6D6

// Unset global variables
unset($GLOBALS['utf_jamo_index'], $GLOBALS['utf_jamo_type'], $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_combining_class'], $GLOBALS['utf_canonical_comp'], $GLOBALS['utf_canonical_decomp'], $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);

// NFC_QC and NFKC_QC values
define('UNICODE_QC_MAYBE', 0);
define('UNICODE_QC_NO', 1);

// Contains all the ASCII characters appearing in UTF-8, sorted by frequency
define('UTF8_ASCII_RANGE', "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F");

// Contains all the tail bytes that can appear in the composition of a UTF-8 char
define('UTF8_TRAILING_BYTES', "\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A");

// Constants used by the Hangul [de]composition algorithms
define('UNICODE_HANGUL_SBASE', 0xAC00);
define('UNICODE_HANGUL_LBASE', 0x1100);
define('UNICODE_HANGUL_VBASE', 0x1161);
define('UNICODE_HANGUL_TBASE', 0x11A7);
define('UNICODE_HANGUL_SCOUNT', 11172);
define('UNICODE_HANGUL_LCOUNT', 19);
define('UNICODE_HANGUL_VCOUNT', 21);
define('UNICODE_HANGUL_TCOUNT', 28);
define('UNICODE_HANGUL_NCOUNT', 588);
define('UNICODE_JAMO_L', 0);
define('UNICODE_JAMO_V', 1);
define('UNICODE_JAMO_T', 2);

/**
* Unicode normalization routines
*
* All methods are called statically (utf_normalizer::method()).
*
* @package phpBB3
*/
class utf_normalizer
{
	/**
	* Validate, cleanup and normalize a string
	*
	* The ultimate convenience function! Clean up invalid UTF-8 sequences,
	* and convert to Normal Form C, canonical composition.
	*
	* Strings made only of printable ASCII, \t, \n and \r are returned untouched.
	* Any other C0 control byte is turned into an illegal 0xFF byte first, so that
	* recompose() replaces it with UTF8_REPLACEMENT.
	*
	* @param	string	$str	The dirty string
	* @return	string			The same string, all shiny and cleaned-up
	*/
	function cleanup($str)
	{
		// The string below is the list of all authorized characters, sorted by frequency in latin text
		$pos = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");
		$len = strlen($str);

		if ($pos == $len)
		{
			// ASCII strings with no special chars return immediately
			return $str;
		}

		// Note: we do not check for $GLOBALS['utf_canonical_decomp']. It is assumed they are always loaded together
		if (!isset($GLOBALS['utf_nfc_qc']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
		}

		// Every byte in the range 0x00..0x1F, except for \t (0x09), \n (0x0A) and \r (0x0D)
		$control_bytes = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F";

		// We replace those characters with a 0xFF byte, which is illegal in UTF-8 and will in turn be replaced with a
		// UTF replacement char. The replacement list is derived from the source list so both always have the same
		// length (strtr() silently ignores the surplus of the longer one).
		return utf_normalizer::recompose(
			strtr($str, $control_bytes, str_repeat("\xFF", strlen($control_bytes))),
			$pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']
		);
	}

	/**
	* Validate and normalize a UTF string to NFC
	*
	* @param	string	$str	Unchecked UTF string
	* @return	string			The string, validated and in normal form
	*/
	function nfc($str)
	{
		$pos = strspn($str, UTF8_ASCII_RANGE);
		$len = strlen($str);

		if ($pos == $len)
		{
			// ASCII strings return immediately
			return $str;
		}

		if (!isset($GLOBALS['utf_nfc_qc']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
		}

		return utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
	}

	/**
	* Validate and normalize a UTF string to NFKC
	*
	* @param	string	$str	Unchecked UTF string
	* @return	string			The string, validated and in normal form
	*/
	function nfkc($str)
	{
		$pos = strspn($str, UTF8_ASCII_RANGE);
		$len = strlen($str);

		if ($pos == $len)
		{
			// ASCII strings return immediately
			return $str;
		}

		if (!isset($GLOBALS['utf_nfkc_qc']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx);
		}

		// The compatibility decomposition map is handed to recompose() by reference below. Make sure it
		// really is loaded instead of relying on another data file defining it as a side effect: an unset
		// entry would silently become null and no character would ever be decomposed.
		if (!isset($GLOBALS['utf_compatibility_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
		}

		// recompose() reads the canonical composition table through a global
		if (!isset($GLOBALS['utf_canonical_comp']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);
		}

		return utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
	}

	/**
	* Validate and normalize a UTF string to NFD
	*
	* @param	string	$str	Unchecked UTF string
	* @return	string			The string, validated and in normal form
	*/
	function nfd($str)
	{
		$pos = strspn($str, UTF8_ASCII_RANGE);
		$len = strlen($str);

		if ($pos == $len)
		{
			// ASCII strings return immediately
			return $str;
		}

		if (!isset($GLOBALS['utf_canonical_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
		}

		return utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);
	}

	/**
	* Validate and normalize a UTF string to NFKD
	*
	* @param	string	$str	Unchecked UTF string
	* @return	string			The string, validated and in normal form
	*/
	function nfkd($str)
	{
		$pos = strspn($str, UTF8_ASCII_RANGE);
		$len = strlen($str);

		if ($pos == $len)
		{
			// ASCII strings return immediately
			return $str;
		}

		if (!isset($GLOBALS['utf_compatibility_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
		}

		return utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);
	}


	/**
	* Recompose a UTF string
	*
	* @param	string	$str			Unchecked UTF string
	* @param	integer	$pos			Position of the first UTF char (in bytes)
	* @param	integer	$len			Length of the string (in bytes)
	* @param	array	$qc				Quick-check array, passed by reference but never modified
	* @param	array	$decomp_map		Decomposition mapping, passed by reference but never modified
	* @return	string					The string, validated and recomposed
	*
	* @access	private
	*/
	function recompose($str, $pos, $len, &$qc, &$decomp_map)
	{
		global $utf_combining_class, $utf_canonical_comp, $utf_jamo_type, $utf_jamo_index;

		// Load some commonly-used tables
		if (!isset($utf_jamo_index, $utf_jamo_type, $utf_combining_class))
		{
			global $phpbb_root_path;
			include ($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.php');
		}

		// Buffer the last ASCII char before the UTF-8 stuff if applicable
		$tmp = '';
		$i = $tmp_pos = $last_cc = 0;

		if ($pos)
		{
			$buffer = array(++$i => $str[$pos - 1]);
		}
		else
		{
			$buffer = array();
		}

		// UTF char length array
		// This array is used to determine the length of a UTF character.
		// Be $c the result of ($str[$pos] & "\xF0") --where $str is the string we're operating on and $pos
		// the position of the cursor--, if $utf_len_mask[$c] does not exist, the byte is an ASCII char.
		// Otherwise, if $utf_len_mask[$c] is greater than 0, we have the leading byte of a multibyte character
		// whose length is $utf_len_mask[$c] and if it is equal to 0, the byte is a trailing byte.
$utf_len_mask = array(
	// Leading bytes masks
	"\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
	// Trailing bytes masks
	"\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
);

// Leading bytes that require an additional legality check (overlong forms, surrogates, U+FFFE/U+FFFF, out of range)
$extra_check = array(
	"\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
	"\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
	"\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
);

// ($utf_char & mask) must equal the matching "check" string for a sequence of that length to be well-formed
$utf_validation_mask = array(
	2 => "\xE0\xC0",
	3 => "\xF0\xC0\xC0",
	4 => "\xF8\xC0\xC0\xC0"
);

$utf_validation_check = array(
	2 => "\xC0\x80",
	3 => "\xE0\x80\x80",
	4 => "\xF0\x80\x80\x80"
);

// Main loop
do
{
	// STEP 0: Capture the current char and buffer it
	$c = $str[$pos];
	$c_mask = $c & "\xF0";

	if (isset($utf_len_mask[$c_mask]))
	{
		// Byte at $pos is either a leading byte or a misplaced trailing byte
		if ($utf_len = $utf_len_mask[$c_mask])
		{
			// Capture the char
			$buffer[++$i & 7] = $utf_char = substr($str, $pos, $utf_len);

			// Let's find out if a thorough check is needed
			if (isset($qc[$utf_char]))
			{
				// If the UTF char is in the qc array then it may not be in normal form. We do nothing here, the actual processing is below this "if" block
			}
			else if (isset($utf_combining_class[$utf_char]))
			{
				if ($utf_combining_class[$utf_char] < $last_cc)
				{
					// A combining character that is NOT canonically ordered
				}
				else
				{
					// A combining character that IS canonically ordered, skip to the next char
					$last_cc = $utf_combining_class[$utf_char];

					$pos += $utf_len;
					continue;
				}
			}
			else
			{
				// At this point, $utf_char holds a UTF char that we know is not a NF[K]C_QC and is not a combining character.
				// It can be a singleton, a canonical composite, a replacement char or even an ill-formed bunch of bytes. Let's find out
				$last_cc = 0;

				// Check that we have the correct number of trailing bytes
				if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
				{
					// Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
					// has been encoded in a five- or six- byte sequence
					if ($utf_char[0] >= "\xF8")
					{
						// Same classification as in the "out of the Unicode range" case below. The branches must be
						// mutually exclusive, otherwise a 0xF8..0xFB lead byte would be treated as a six-byte lead
						// and swallow one trailing byte too many.
						if ($utf_char[0] < "\xFC")
						{
							// 0xF8..0xFB: lead byte of a five-byte sequence
							$trailing_bytes = 4;
						}
						else if ($utf_char[0] > "\xFD")
						{
							// 0xFE and 0xFF never take trailing bytes
							$trailing_bytes = 0;
						}
						else
						{
							// 0xFC..0xFD: lead byte of a six-byte sequence
							$trailing_bytes = 5;
						}
					}
					else
					{
						$trailing_bytes = $utf_len - 1;
					}

					// Replace the lead byte and at most $trailing_bytes trailing bytes with a single replacement char.
					// Note: ++$pos is evaluated before the compound assignment reads $pos
					$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
					$pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
					$tmp_pos = $pos;

					continue;
				}

				if (isset($extra_check[$c]))
				{
					switch ($c)
					{
						// Note: 0xED is quite common in Korean
						case "\xED":
							if ($utf_char >= "\xED\xA0\x80")
							{
								// Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
								$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
								$pos += $utf_len;
								$tmp_pos = $pos;
								continue 2;
							}
						break;

						// Note: 0xEF is quite common in Japanese
						case "\xEF":
							if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
							{
								// U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
								$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
								$pos += $utf_len;
								$tmp_pos = $pos;
								continue 2;
							}
						break;

						case "\xC0":
						case "\xC1":
							if ($utf_char <= "\xC1\xBF")
							{
								// Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
								$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
								$pos += $utf_len;
								$tmp_pos = $pos;
								continue 2;
							}
						break;

						case "\xE0":
							if ($utf_char <= "\xE0\x9F\xBF")
							{
								// Unicode char U+0000..U+07FF encoded in 3 bytes
								$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
								$pos += $utf_len;
								$tmp_pos = $pos;
								continue 2;
							}
						break;

						case "\xF0":
							if ($utf_char <= "\xF0\x8F\xBF\xBF")
							{
								// Unicode char U+0000..U+FFFF encoded in 4 bytes
								$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
								$pos += $utf_len;
								$tmp_pos = $pos;
								continue 2;
							}
						break;

						default:
							// Five- and six- byte sequences do not need being checked for here anymore
							if ($utf_char > UTF8_MAX)
							{
								// Out of the Unicode range
								if ($utf_char[0] < "\xF8")
								{
									$trailing_bytes = 3;
								}
								else if ($utf_char[0] < "\xFC")
								{
									$trailing_bytes = 4;
								}
								else if ($utf_char[0] > "\xFD")
								{
									$trailing_bytes = 0;
								}
								else
								{
									$trailing_bytes = 5;
								}

								$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
								$pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
								$tmp_pos = $pos;
								continue 2;
							}
						break;
					}
				}

				// The char is a valid starter, move the cursor and go on
				$pos += $utf_len;
				continue;
			}
		}
		else
		{
			// A trailing byte came out of nowhere, we will advance the cursor and treat this byte and all following trailing bytes as if
			// each of them was a Unicode replacement char
			$spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
			$tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);

			$pos += $spn;
			$tmp_pos = $pos;
			continue;
		}


		// STEP 1: Decompose current char

		// We have found a character that is either:
		//  - in the NFC_QC/NFKC_QC list
		//  - a non-starter char that is not canonically ordered
		//
		// We are going to capture the shortest UTF sequence that satisfies these two conditions:
		//
		//  1 - If the sequence does not start at the beginning of the string, it must begin with a starter,
		// and that starter must not have the NF[K]C_QC property equal to "MAYBE"
		//
		//  2 - If the sequence does not end at the end of the string, it must end with a non-starter and be
		// immediately followed by a starter that is not on the QC list
		//
		$utf_seq = array();
		$last_cc = 0;
		$lpos = $pos;
		$pos += $utf_len;

		if (isset($decomp_map[$utf_char]))
		{
			$_pos = 0;
			$_len = strlen($decomp_map[$utf_char]);

			do
			{
				// Taken by reference: for an ASCII byte the entry does not exist and isset() below is false
				$_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];

				if (isset($_utf_len))
				{
					$utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
					$_pos += $_utf_len;
				}
				else
				{
					$utf_seq[] = $decomp_map[$utf_char][$_pos];
					++$_pos;
				}
			}
			while ($_pos < $_len);
		}
		else
		{
			// The char is not decomposable
			$utf_seq = array($utf_char);
		}


		// STEP 2: Capture the starter

		// Check out the combining class of the first character of the UTF sequence
		$k = 0;
		if (isset($utf_combining_class[$utf_seq[0]]) || $qc[$utf_char] == UNICODE_QC_MAYBE)
		{
			// Not a starter, inspect previous characters
			// The last 8 characters are kept in a buffer so that we don't have to capture them everytime.
			// This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,
			// although it is slower than this method.
			//
			// In the following loop, $j starts at the previous buffered character ($i - 1, because current character is
			// at offset $i) and process them in backward mode until we find a starter.
			//
			// $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more
			// characters numbered 0 to n. $k starts at 0 and for each char we prepend we pre-decrement it and for numbering
			$starter_found = 0;
			$j_min = max(1, $i - 7);

			for ($j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j)
			{
				$utf_char = $buffer[$j & 7];
				$lpos -= strlen($utf_char);

				if (isset($decomp_map[$utf_char]))
				{
					// The char is a composite, decompose for storage
					$decomp_seq = array();
					$_pos = 0;
					$_len = strlen($decomp_map[$utf_char]);

					do
					{
						$c = $decomp_map[$utf_char][$_pos];
						$_utf_len =& $utf_len_mask[$c & "\xF0"];

						if (isset($_utf_len))
						{
							$decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
							$_pos += $_utf_len;
						}
						else
						{
							$decomp_seq[] = $c;
							++$_pos;
						}
					}
					while ($_pos < $_len);

					// Prepend the UTF sequence with our decomposed sequence
					if (isset($decomp_seq[1]))
					{
						// The char expanded into several chars
						$decomp_cnt = sizeof($decomp_seq);

						foreach ($decomp_seq as $decomp_i => $decomp_char)
						{
							$utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;
						}
						$k -= $decomp_cnt;
					}
					else
					{
						// Decomposed to a single char, easier to prepend
						$utf_seq[--$k] = $decomp_seq[0];
					}
				}
				else
				{
					$utf_seq[--$k] = $utf_char;
				}

				if (!isset($utf_combining_class[$utf_seq[$k]]))
				{
					// We have found our starter
					$starter_found = 1;
					break;
				}
			}

			if (!$starter_found && $lpos > $tmp_pos)
			{
				// The starter was not found in the buffer, let's rewind some more
				do
				{
					// $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_len > 0 then it's a leading byte, otherwise it's a trailing byte.
					$c = $str[--$lpos];
					$c_mask = $c & "\xF0";

					if (isset($utf_len_mask[$c_mask]))
					{
						// UTF byte
						if ($utf_len = $utf_len_mask[$c_mask])
						{
							// UTF *leading* byte
							$utf_char = substr($str, $lpos, $utf_len);

							if (isset($decomp_map[$utf_char]))
							{
								// Decompose the character
								$decomp_seq = array();
								$_pos = 0;
								$_len = strlen($decomp_map[$utf_char]);

								do
								{
									$c = $decomp_map[$utf_char][$_pos];
									$_utf_len =& $utf_len_mask[$c & "\xF0"];

									if (isset($_utf_len))
									{
										$decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
										$_pos += $_utf_len;
									}
									else
									{
										$decomp_seq[] = $c;
										++$_pos;
									}
								}
								while ($_pos < $_len);

								// Prepend the UTF sequence with our decomposed sequence
								if (isset($decomp_seq[1]))
								{
									// The char expanded into several chars
									$decomp_cnt = sizeof($decomp_seq);
									foreach ($decomp_seq as $decomp_i => $utf_char)
									{
										$utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;
									}
									$k -= $decomp_cnt;
								}
								else
								{
									// Decomposed to a single char, easier to prepend
									$utf_seq[--$k] = $decomp_seq[0];
								}
							}
							else
							{
								$utf_seq[--$k] = $utf_char;
							}
						}
					}
					else
					{
						// ASCII char
						$utf_seq[--$k] = $c;
					}
				}
				while ($lpos > $tmp_pos);
			}
		}


		// STEP 3: Capture following combining modifiers

		while ($pos < $len)
		{
			$c_mask = $str[$pos] & "\xF0";

			if (isset($utf_len_mask[$c_mask]))
			{
				if ($utf_len = $utf_len_mask[$c_mask])
				{
					$utf_char = substr($str, $pos, $utf_len);
				}
				else
				{
					// A trailing byte came out of nowhere
					// Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop
					// as if it was a starter (replacement chars ARE starters) and let the next loop replace it
					break;
				}

				if (isset($utf_combining_class[$utf_char]) || isset($qc[$utf_char]))
				{
					// Combining character, add it to the sequence and move the cursor
					if (isset($decomp_map[$utf_char]))
					{
						// Decompose the character
						$_pos = 0;
						$_len = strlen($decomp_map[$utf_char]);

						do
						{
							$c = $decomp_map[$utf_char][$_pos];
							$_utf_len =& $utf_len_mask[$c & "\xF0"];

							if (isset($_utf_len))
							{
								$utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
								$_pos += $_utf_len;
							}
							else
							{
								$utf_seq[] = $c;
								++$_pos;
							}
						}
						while ($_pos < $_len);
					}
					else
					{
						$utf_seq[] = $utf_char;
					}

					$pos += $utf_len;
				}
				else
				{
					// Combining class 0 and no QC, break out of the loop
					// Note: we do not know if that character is valid. If it's not, the next iteration will replace it
					break;
				}
			}
			else
			{
				// ASCII chars are starters
				break;
			}
		}


		// STEP 4: Sort and combine

		// Here we sort...
		$k_max = $k + sizeof($utf_seq);

		if (!$k && $k_max == 1)
		{
			// There is only one char in the UTF sequence, add it then jump to the next iteration of main loop
			// Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases
//			if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))
//			{
				$tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $utf_seq[0];
				$tmp_pos = $pos;
//			}

			continue;
		}

		// ...there we combine
		if (isset($utf_combining_class[$utf_seq[$k]]))
		{
			$starter = $nf_seq = '';
		}
		else
		{
			$starter = $utf_seq[$k++];
			$nf_seq = '';
		}
		$utf_sort = array();

		// We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine
		// at the end of the string without altering it
		$utf_seq[] = '';

		do
		{
			$utf_char = $utf_seq[$k++];

			if (isset($utf_combining_class[$utf_char]))
			{
				$utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
			}
			else
			{
				if (empty($utf_sort))
				{
					// No combining characters... check for a composite of the two starters
					if (isset($utf_canonical_comp[$starter . $utf_char]))
					{
						// Good ol' composite character
						$starter = $utf_canonical_comp[$starter . $utf_char];
					}
					else if (isset($utf_jamo_type[$utf_char]))
					{
						// Current char is a composable jamo
						if (isset($utf_jamo_type[$starter]) && $utf_jamo_type[$starter] == UNICODE_JAMO_L && $utf_jamo_type[$utf_char] == UNICODE_JAMO_V)
						{
							// We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo
							if (isset($utf_jamo_type[$utf_seq[$k]]) && $utf_jamo_type[$utf_seq[$k]] == UNICODE_JAMO_T)
							{
								// L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)
								$cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char] + $utf_jamo_index[$utf_seq[$k]];
								++$k;
							}
							else
							{
								// L+V jamos, combine to a LV Hangul syllable
								$cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char];
							}

							// Encode the code point as a three-byte UTF-8 sequence
							$starter = chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
						}
						else
						{
							// Non-composable jamo, just add it to the sequence
							$nf_seq .= $starter;
							$starter = $utf_char;
						}
					}
					else
					{
						// No composite, just add the first starter to the sequence then continue with the other one
						$nf_seq .= $starter;
						$starter = $utf_char;
					}
				}
				else
				{
					ksort($utf_sort);

					// For each class of combining characters
					foreach ($utf_sort as $cc => $utf_chars)
					{
						$j = 0;

						do
						{
							// Look for a composite
							if (isset($utf_canonical_comp[$starter . $utf_chars[$j]]))
							{
								// Found a composite, replace the starter
								$starter = $utf_canonical_comp[$starter . $utf_chars[$j]];
								unset($utf_sort[$cc][$j]);
							}
							else
							{
								// No composite, all following characters in that class are blocked
								break;
							}
						}
						while (isset($utf_sort[$cc][++$j]));
					}

					// Add the starter to the normalized sequence, followed by non-starters in canonical order
					$nf_seq .= $starter;

					foreach ($utf_sort as $utf_chars)
					{
						if (!empty($utf_chars))
						{
							$nf_seq .= implode('', $utf_chars);
						}
					}

					// Reset the array and go on
					$utf_sort = array();
					$starter = $utf_char;
				}
			}
		}
		while ($k <= $k_max);

		$tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $nf_seq;
		$tmp_pos = $pos;
	}
	else
	{
		// Only a ASCII char can make the program get here
		//
		// First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().
884 // 885 // The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on 886 // multi-byte text (where the only ASCII chars are spaces and punctuation) 887 if (++$pos != $len) 888 { 889 if ($str[$pos] < "\x80") 890 { 891 $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos); 892 $buffer[++$i & 7] = $str[$pos - 1]; 893 } 894 else 895 { 896 $buffer[++$i & 7] = $c; 897 } 898 } 899 } 900 } 901 while ($pos < $len); 902 903 // Now is time to return the string 904 if ($tmp_pos) 905 { 906 // If the $tmp_pos cursor is not at the beggining of the string then at least one character was not in normal form. Replace $str with the fixed version 907 if ($tmp_pos == $len) 908 { 909 // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str 910 return $tmp; 911 } 912 else 913 { 914 // The rightmost chunk of $str has not been appended to $tmp yet 915 return $tmp . substr($str, $tmp_pos); 916 } 917 } 918 919 // The string was already in normal form 920 return $str; 921 } 922 923 /** 924 * Decompose a UTF string 925 * 926 * @param string $str UTF string 927 * @param integer $pos Position of the first UTF char (in bytes) 928 * @param integer $len Length of the string (in bytes) 929 * @param array $decomp_map Decomposition mapping, passed by reference but never modified 930 * @return string The string, decomposed and sorted canonically 931 * 932 * @access private 933 */ 934 function decompose($str, $pos, $len, &$decomp_map) 935 { 936 global $utf_combining_class, $utf_canonical_decomp, $phpbb_root_path; 937 938 // Load some commonly-used tables 939 if (!isset($utf_combining_class)) 940 { 941 include ($phpbb_root_path . 
'includes/utf/data/utf_normalizer_common.php'); 942 } 943 944 // UTF char length array 945 $utf_len_mask = array( 946 // Leading bytes masks 947 "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4, 948 // Trailing bytes masks 949 "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0 950 ); 951 952 // Some extra checks are triggered on the first byte of a UTF sequence 953 $extra_check = array( 954 "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1, 955 "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1, 956 "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1 957 ); 958 959 // These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge: 960 // - 2-byte: 110? ???? 10?? ???? 961 // - 3-byte: 1110 ???? 10?? ???? 10?? ???? 962 // - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ???? 963 // Note that 5- and 6- byte sequences are automatically discarded 964 $utf_validation_mask = array( 965 2 => "\xE0\xC0", 966 3 => "\xF0\xC0\xC0", 967 4 => "\xF8\xC0\xC0\xC0" 968 ); 969 970 $utf_validation_check = array( 971 2 => "\xC0\x80", 972 3 => "\xE0\x80\x80", 973 4 => "\xF0\x80\x80\x80" 974 ); 975 976 $tmp = ''; 977 $starter_pos = $pos; 978 $tmp_pos = $last_cc = $sort = $dump = 0; 979 $utf_sort = array(); 980 981 982 // Main loop 983 do 984 { 985 // STEP 0: Capture the current char 986 987 $cur_mask = $str[$pos] & "\xF0"; 988 if (isset($utf_len_mask[$cur_mask])) 989 { 990 if ($utf_len = $utf_len_mask[$cur_mask]) 991 { 992 // Multibyte char 993 $utf_char = substr($str, $pos, $utf_len); 994 $pos += $utf_len; 995 } 996 else 997 { 998 // A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode 999 // replacement char and we will advance the cursor 1000 $spn = strspn($str, UTF8_TRAILING_BYTES, $pos); 1001 1002 if ($dump) 1003 { 1004 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1005 1006 // Dump 
combiners 1007 if (!empty($utf_sort)) 1008 { 1009 if ($sort) 1010 { 1011 ksort($utf_sort); 1012 } 1013 1014 foreach($utf_sort as $utf_chars) 1015 { 1016 $tmp .= implode('', $utf_chars); 1017 } 1018 } 1019 1020 $tmp .= str_repeat(UTF8_REPLACEMENT, $spn); 1021 $dump = $sort = 0; 1022 } 1023 else 1024 { 1025 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn); 1026 } 1027 1028 $pos += $spn; 1029 $tmp_pos = $starter_pos = $pos; 1030 1031 $utf_sort = array(); 1032 $last_cc = 0; 1033 1034 continue; 1035 } 1036 1037 1038 // STEP 1: Decide what to do with current char 1039 1040 // Now, in that order: 1041 // - check if that character is decomposable 1042 // - check if that character is a non-starter 1043 // - check if that character requires extra checks to be performed 1044 if (isset($decomp_map[$utf_char])) 1045 { 1046 // Decompose the char 1047 $_pos = 0; 1048 $_len = strlen($decomp_map[$utf_char]); 1049 1050 do 1051 { 1052 $c = $decomp_map[$utf_char][$_pos]; 1053 $_utf_len =& $utf_len_mask[$c & "\xF0"]; 1054 1055 if (isset($_utf_len)) 1056 { 1057 $_utf_char = substr($decomp_map[$utf_char], $_pos, $_utf_len); 1058 $_pos += $_utf_len; 1059 1060 if (isset($utf_combining_class[$_utf_char])) 1061 { 1062 // The character decomposed to a non-starter, buffer it for sorting 1063 $utf_sort[$utf_combining_class[$_utf_char]][] = $_utf_char; 1064 1065 if ($utf_combining_class[$_utf_char] < $last_cc) 1066 { 1067 // Not canonically ordered, will require sorting 1068 $sort = $dump = 1; 1069 } 1070 else 1071 { 1072 $dump = 1; 1073 $last_cc = $utf_combining_class[$_utf_char]; 1074 } 1075 } 1076 else 1077 { 1078 // This character decomposition contains a starter, dump the buffer and continue 1079 if ($dump) 1080 { 1081 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1082 1083 // Dump combiners 1084 if (!empty($utf_sort)) 1085 { 1086 if ($sort) 1087 { 1088 ksort($utf_sort); 1089 } 1090 1091 foreach ($utf_sort as $utf_chars) 1092 { 1093 $tmp .= 
implode('', $utf_chars); 1094 } 1095 } 1096 1097 $tmp .= $_utf_char; 1098 $dump = $sort = 0; 1099 } 1100 else 1101 { 1102 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos) . $_utf_char; 1103 } 1104 1105 $tmp_pos = $starter_pos = $pos; 1106 $utf_sort = array(); 1107 $last_cc = 0; 1108 } 1109 } 1110 else 1111 { 1112 // This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue 1113 ++$_pos; 1114 1115 if ($dump) 1116 { 1117 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1118 1119 // Dump combiners 1120 if (!empty($utf_sort)) 1121 { 1122 if ($sort) 1123 { 1124 ksort($utf_sort); 1125 } 1126 1127 foreach ($utf_sort as $utf_chars) 1128 { 1129 $tmp .= implode('', $utf_chars); 1130 } 1131 } 1132 1133 $tmp .= $c; 1134 $dump = $sort = 0; 1135 } 1136 else 1137 { 1138 $tmp .= substr($str, $tmp_pos, $pos - $utf_len - $tmp_pos) . $c; 1139 } 1140 1141 $tmp_pos = $starter_pos = $pos; 1142 $utf_sort = array(); 1143 $last_cc = 0; 1144 } 1145 } 1146 while ($_pos < $_len); 1147 } 1148 else if (isset($utf_combining_class[$utf_char])) 1149 { 1150 // Combining character 1151 if ($utf_combining_class[$utf_char] < $last_cc) 1152 { 1153 // Not in canonical order 1154 $sort = $dump = 1; 1155 } 1156 else 1157 { 1158 $last_cc = $utf_combining_class[$utf_char]; 1159 } 1160 1161 $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char; 1162 } 1163 else 1164 { 1165 // Non-decomposable starter, check out if it's a Hangul syllable 1166 if ($utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST) 1167 { 1168 // Nope, regular UTF char, check that we have the correct number of trailing bytes 1169 if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len]) 1170 { 1171 // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char 1172 // has been encoded in a five- or six- byte sequence. 
1173 // Move the cursor back to its original position then advance it to the position it should really be at 1174 $pos -= $utf_len; 1175 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1176 1177 if (!empty($utf_sort)) 1178 { 1179 ksort($utf_sort); 1180 1181 foreach ($utf_sort as $utf_chars) 1182 { 1183 $tmp .= implode('', $utf_chars); 1184 } 1185 $utf_sort = array(); 1186 } 1187 1188 // Add a replacement char then another replacement char for every trailing byte. 1189 // 1190 // @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this 1191 $spn = strspn($str, UTF8_TRAILING_BYTES, ++$pos); 1192 $tmp .= str_repeat(UTF8_REPLACEMENT, $spn + 1); 1193 1194 $dump = $sort = 0; 1195 1196 $pos += $spn; 1197 $tmp_pos = $pos; 1198 continue; 1199 } 1200 1201 if (isset($extra_check[$utf_char[0]])) 1202 { 1203 switch ($utf_char[0]) 1204 { 1205 // Note: 0xED is quite common in Korean 1206 case "\xED": 1207 if ($utf_char >= "\xED\xA0\x80") 1208 { 1209 // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF) 1210 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1211 1212 if (!empty($utf_sort)) 1213 { 1214 ksort($utf_sort); 1215 1216 foreach ($utf_sort as $utf_chars) 1217 { 1218 $tmp .= implode('', $utf_chars); 1219 } 1220 $utf_sort = array(); 1221 } 1222 1223 $tmp .= UTF8_REPLACEMENT; 1224 $dump = $sort = 0; 1225 1226 $tmp_pos = $starter_pos = $pos; 1227 continue 2; 1228 } 1229 break; 1230 1231 // Note: 0xEF is quite common in Japanese 1232 case "\xEF": 1233 if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF") 1234 { 1235 // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF) 1236 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1237 1238 if (!empty($utf_sort)) 1239 { 1240 ksort($utf_sort); 1241 1242 foreach ($utf_sort as $utf_chars) 1243 { 1244 $tmp .= implode('', $utf_chars); 1245 } 1246 $utf_sort = array(); 1247 } 1248 1249 $tmp .= 
UTF8_REPLACEMENT; 1250 $dump = $sort = 0; 1251 1252 $tmp_pos = $starter_pos = $pos; 1253 continue 2; 1254 } 1255 break; 1256 1257 case "\xC0": 1258 case "\xC1": 1259 if ($utf_char <= "\xC1\xBF") 1260 { 1261 // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char 1262 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1263 1264 if (!empty($utf_sort)) 1265 { 1266 ksort($utf_sort); 1267 1268 foreach ($utf_sort as $utf_chars) 1269 { 1270 $tmp .= implode('', $utf_chars); 1271 } 1272 $utf_sort = array(); 1273 } 1274 1275 $tmp .= UTF8_REPLACEMENT; 1276 $dump = $sort = 0; 1277 1278 $tmp_pos = $starter_pos = $pos; 1279 continue 2; 1280 } 1281 break; 1282 1283 case "\xE0": 1284 if ($utf_char <= "\xE0\x9F\xBF") 1285 { 1286 // Unicode char U+0000..U+07FF encoded in 3 bytes 1287 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1288 1289 if (!empty($utf_sort)) 1290 { 1291 ksort($utf_sort); 1292 1293 foreach ($utf_sort as $utf_chars) 1294 { 1295 $tmp .= implode('', $utf_chars); 1296 } 1297 $utf_sort = array(); 1298 } 1299 1300 $tmp .= UTF8_REPLACEMENT; 1301 $dump = $sort = 0; 1302 1303 $tmp_pos = $starter_pos = $pos; 1304 continue 2; 1305 } 1306 break; 1307 1308 case "\xF0": 1309 if ($utf_char <= "\xF0\x8F\xBF\xBF") 1310 { 1311 // Unicode char U+0000..U+FFFF encoded in 4 bytes 1312 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1313 1314 if (!empty($utf_sort)) 1315 { 1316 ksort($utf_sort); 1317 1318 foreach ($utf_sort as $utf_chars) 1319 { 1320 $tmp .= implode('', $utf_chars); 1321 } 1322 $utf_sort = array(); 1323 } 1324 1325 $tmp .= UTF8_REPLACEMENT; 1326 $dump = $sort = 0; 1327 1328 $tmp_pos = $starter_pos = $pos; 1329 continue 2; 1330 } 1331 break; 1332 1333 default: 1334 if ($utf_char > UTF8_MAX) 1335 { 1336 // Out of the Unicode range 1337 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1338 1339 if (!empty($utf_sort)) 1340 { 1341 ksort($utf_sort); 1342 1343 foreach ($utf_sort as $utf_chars) 1344 { 1345 $tmp .= 
implode('', $utf_chars); 1346 } 1347 $utf_sort = array(); 1348 } 1349 1350 $tmp .= UTF8_REPLACEMENT; 1351 $dump = $sort = 0; 1352 1353 $tmp_pos = $starter_pos = $pos; 1354 continue 2; 1355 } 1356 break; 1357 } 1358 } 1359 } 1360 else 1361 { 1362 // Hangul syllable 1363 $idx = (((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE; 1364 1365 // LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase). 1366 // 1367 // The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte 1368 if ($tIndex = $idx % UNICODE_HANGUL_TCOUNT) 1369 { 1370 if ($tIndex < 25) 1371 { 1372 $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00"; 1373 $utf_char[8] = chr(0xA7 + $tIndex); 1374 } 1375 else 1376 { 1377 $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00"; 1378 $utf_char[8] = chr(0x67 + $tIndex); 1379 } 1380 } 1381 else 1382 { 1383 $utf_char = "\xE1\x84\x00\xE1\x85\x00"; 1384 } 1385 1386 $utf_char[2] = chr(0x80 + (int) ($idx / UNICODE_HANGUL_NCOUNT)); 1387 $utf_char[5] = chr(0xA1 + (int) (($idx % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT)); 1388 1389 // Just like other decompositions, the resulting Jamos must be dumped to the tmp string 1390 $dump = 1; 1391 } 1392 1393 // Do we need to dump stuff to the tmp string? 
1394 if ($dump) 1395 { 1396 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1397 1398 // Dump combiners 1399 if (!empty($utf_sort)) 1400 { 1401 if ($sort) 1402 { 1403 ksort($utf_sort); 1404 } 1405 1406 foreach ($utf_sort as $utf_chars) 1407 { 1408 $tmp .= implode('', $utf_chars); 1409 } 1410 } 1411 1412 $tmp .= $utf_char; 1413 $dump = $sort = 0; 1414 $tmp_pos = $pos; 1415 } 1416 1417 $last_cc = 0; 1418 $utf_sort = array(); 1419 $starter_pos = $pos; 1420 } 1421 } 1422 else 1423 { 1424 // ASCII char, which happens to be a starter (as any other ASCII char) 1425 if ($dump) 1426 { 1427 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1428 1429 // Dump combiners 1430 if (!empty($utf_sort)) 1431 { 1432 if ($sort) 1433 { 1434 ksort($utf_sort); 1435 } 1436 1437 foreach ($utf_sort as $utf_chars) 1438 { 1439 $tmp .= implode('', $utf_chars); 1440 } 1441 } 1442 1443 $tmp .= $str[$pos]; 1444 $dump = $sort = 0; 1445 $tmp_pos = ++$pos; 1446 1447 $pos += strspn($str, UTF8_ASCII_RANGE, $pos); 1448 } 1449 else 1450 { 1451 $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos); 1452 } 1453 1454 $last_cc = 0; 1455 $utf_sort = array(); 1456 $starter_pos = $pos; 1457 } 1458 } 1459 while ($pos < $len); 1460 1461 // Now is time to return the string 1462 if ($dump) 1463 { 1464 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos); 1465 1466 // Dump combiners 1467 if (!empty($utf_sort)) 1468 { 1469 if ($sort) 1470 { 1471 ksort($utf_sort); 1472 } 1473 1474 foreach ($utf_sort as $utf_chars) 1475 { 1476 $tmp .= implode('', $utf_chars); 1477 } 1478 } 1479 1480 return $tmp; 1481 1482 } 1483 else if ($tmp_pos) 1484 { 1485 // If the $tmp_pos cursor was moved then at least one character was not in normal form. 
Replace $str with the fixed version 1486 if ($tmp_pos == $len) 1487 { 1488 // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str 1489 return $tmp; 1490 } 1491 else 1492 { 1493 // The rightmost chunk of $str has not been appended to $tmp yet 1494 return $tmp . substr($str, $tmp_pos); 1495 } 1496 } 1497 1498 // The string was already in normal form 1499 return $str; 1500 } 1501 } 1502 1503 ?>
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Wed Nov 22 00:35:05 2006 | Cross-referenced by PHPXref 0.6 |