PHPXRef 0.6 : phpBB 3.0 Beta 3 : /includes/utf/utf

[Summary view] [Print] [Text view]
   1  <?php
   2  /** 
   3  *
   4  * @package phpBB3
   5  * @version $Id: utf_normalizer.php,v 1.6 2006/11/03 23:09:15 davidmj Exp $ 
   6  * @copyright (c) 2005 phpBB Group 
   7  * @license http://opensource.org/licenses/gpl-license.php GNU Public License 
   8  *
   9  */
  10  
  11  /**
  12  * Some Unicode characters encoded in UTF-8
  13  *
  14  * Preserved for compatibility
  15  */
  16  define('UTF8_REPLACEMENT', "\xEF\xBF\xBD");
  17  define('UTF8_MAX', "\xF4\x8F\xBF\xBF");
  18  define('UTF8_FFFE', "\xEF\xBF\xBE");
  19  define('UTF8_FFFF', "\xEF\xBF\xBF");
  20  define('UTF8_SURROGATE_FIRST', "\xED\xA0\x80");
  21  define('UTF8_SURROGATE_LAST', "\xED\xBF\xBF");
  22  define('UTF8_HANGUL_FIRST', "\xEA\xB0\x80");
  23  define('UTF8_HANGUL_LAST', "\xED\x9E\xA3");
  24  
  25  define('UTF8_CJK_FIRST', "\xE4\xB8\x80");
  26  define('UTF8_CJK_LAST', "\xE9\xBE\xBB");
  27  define('UTF8_CJK_B_FIRST', "\xF0\xA0\x80\x80");
  28  define('UTF8_CJK_B_LAST', "\xF0\xAA\x9B\x96");
  29  
  30  // Unset global variables
  31  unset($GLOBALS['utf_jamo_index'], $GLOBALS['utf_jamo_type'], $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_combining_class'], $GLOBALS['utf_canonical_comp'], $GLOBALS['utf_canonical_decomp'], $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
  32  
  33  // NFC_QC and NFKC_QC values
  34  define('UNICODE_QC_MAYBE', 0);
  35  define('UNICODE_QC_NO', 1);
  36  
  37  // Contains all the ASCII characters appearing in UTF-8, sorted by frequency
  38  define('UTF8_ASCII_RANGE', "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F");
  39  
  40  // Contains all the tail bytes that can appear in the composition of a UTF-8 char
  41  define('UTF8_TRAILING_BYTES', "\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A");
  42  
  43  // Constants used by the Hangul [de]composition algorithms
  44  define('UNICODE_HANGUL_SBASE', 0xAC00);
  45  define('UNICODE_HANGUL_LBASE', 0x1100);
  46  define('UNICODE_HANGUL_VBASE', 0x1161);
  47  define('UNICODE_HANGUL_TBASE', 0x11A7);
  48  define('UNICODE_HANGUL_SCOUNT', 11172);
  49  define('UNICODE_HANGUL_LCOUNT', 19);
  50  define('UNICODE_HANGUL_VCOUNT', 21);
  51  define('UNICODE_HANGUL_TCOUNT', 28);
  52  define('UNICODE_HANGUL_NCOUNT', 588);
  53  define('UNICODE_JAMO_L', 0);
  54  define('UNICODE_JAMO_V', 1);
  55  define('UNICODE_JAMO_T', 2);
  56  
  57  /**
  58  * Unicode normalization routines
  59  *
  60  * @package phpBB3
  61  */
  62  class utf_normalizer
  63  {
  64      /**
  65      * Validate, cleanup and normalize a string
  66      *
  67      * The ultimate convenience function! Clean up invalid UTF-8 sequences,
  68      * and convert to Normal Form C, canonical composition.
  69      *
  70      * @param    string    $str    The dirty string
  71      * @return    string            The same string, all shiny and cleaned-up
  72      */
  73  	function cleanup($str)
  74      {
  75          // The string below is the list of all autorized characters, sorted by frequency in latin text
  76          $pos = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");
  77          $len = strlen($str);
  78  
  79          if ($pos == $len)
  80          {
  81              // ASCII strings with no special chars return immediately
  82              return $str;
  83          }
  84  
  85          // Note: we do not check for $GLOBALS['utf_canonical_decomp']. It is assumed they are always loaded together
  86          if (!isset($GLOBALS['utf_nfc_qc']))
  87          {
  88              global $phpbb_root_path, $phpEx;
  89              include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
  90          }
  91  
  92          // Replace any byte in the range 0x00..0x1F, except for \r, \n and \t
  93          // We replace those characters with a 0xFF byte, which is illegal in UTF-8 and will in turn be replaced with a UTF replacement char
  94          return utf_normalizer::recompose(
  95              strtr(
  96                  $str,
  97                  "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
  98                  "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
  99              ),
 100              $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']
 101          );
 102      }
 103  
 104      /**
 105      * Validate and normalize a UTF string to NFC
 106      *
 107      * @param    string    $str    Unchecked UTF string
 108      * @return    string            The string, validated and in normal form
 109      */
 110  	function nfc($str)
 111      {
 112          $pos = strspn($str, UTF8_ASCII_RANGE);
 113          $len = strlen($str);
 114  
 115          if ($pos == $len)
 116          {
 117              // ASCII strings return immediately
 118              return $str;
 119          }
 120  
 121          if (!isset($GLOBALS['utf_nfc_qc']))
 122          {
 123              global $phpbb_root_path, $phpEx;
 124              include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
 125          }
 126  
 127          return utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
 128      }
 129  
 130      /**
 131      * Validate and normalize a UTF string to NFKC
 132      *
 133      * @param    string    $str    Unchecked UTF string
 134      * @return    string            The string, validated and in normal form
 135      */
 136  	function nfkc($str)
 137      {
 138          $pos = strspn($str, UTF8_ASCII_RANGE);
 139          $len = strlen($str);
 140  
 141          if ($pos == $len)
 142          {
 143              // ASCII strings return immediately
 144              return $str;
 145          }
 146  
 147          if (!isset($GLOBALS['utf_nfkc_qc']))
 148          {
 149              global $phpbb_root_path, $phpEx;
 150              include($phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx);
 151          }
 152  
 153          if (!isset($GLOBALS['utf_canonical_comp']))
 154          {
 155              global $phpbb_root_path, $phpEx;
 156              include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);
 157          }
 158  
 159          return utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
 160      }
 161  
 162      /**
 163      * Validate and normalize a UTF string to NFD
 164      *
 165      * @param    string    $str    Unchecked UTF string
 166      * @return    string            The string, validated and in normal form
 167      */
 168  	function nfd($str)
 169      {
 170          $pos = strspn($str, UTF8_ASCII_RANGE);
 171          $len = strlen($str);
 172  
 173          if ($pos == $len)
 174          {
 175              // ASCII strings return immediately
 176              return $str;
 177          }
 178  
 179          if (!isset($GLOBALS['utf_canonical_decomp']))
 180          {
 181              global $phpbb_root_path, $phpEx;
 182              include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
 183          }
 184  
 185          return utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);
 186      }
 187  
 188      /**
 189      * Validate and normalize a UTF string to NFKD
 190      *
 191      * @param    string    $str    Unchecked UTF string
 192      * @return    string            The string, validated and in normal form
 193      */
 194  	function nfkd($str)
 195      {
 196          $pos = strspn($str, UTF8_ASCII_RANGE);
 197          $len = strlen($str);
 198  
 199          if ($pos == $len)
 200          {
 201              // ASCII strings return immediately
 202              return $str;
 203          }
 204  
 205          if (!isset($GLOBALS['utf_compatibility_decomp']))
 206          {
 207              global $phpbb_root_path, $phpEx;
 208              include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
 209          }
 210  
 211          return utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);
 212      }
 213  
 214  
 215      /**
 216      * Recompose a UTF string
 217      *
 218      * @param    string    $str        Unchecked UTF string
 219      * @param    integer    $pos        Position of the first UTF char (in bytes)
 220      * @param    integer    $len        Length of the string (in bytes)
 221      * @param    array    $qc            Quick-check array, passed by reference but never modified
 222      * @param    array    $decomp_map    Decomposition mapping, passed by reference but never modified
 223      * @return    string                The string, validated and recomposed
 224      *
 225      * @access    private
 226      */
 227  	function recompose($str, $pos, $len, &$qc, &$decomp_map)
 228      {
 229          global $utf_combining_class, $utf_canonical_comp, $utf_jamo_type, $utf_jamo_index;
 230  
 231          // Load some commonly-used tables
 232          if (!isset($utf_jamo_index, $utf_jamo_type, $utf_combining_class))
 233          {
 234              global $phpbb_root_path;
 235              include ($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.php');
 236          }
 237  
 238          // Buffer the last ASCII char before the UTF-8 stuff if applicable
 239          $tmp = '';
 240          $i = $tmp_pos = $last_cc = 0;
 241  
 242          if ($pos)
 243          {
 244              $buffer = array(++$i => $str[$pos - 1]);
 245          }
 246          else
 247          {
 248              $buffer = array();
 249          }
 250  
 251          // UTF char length array
 252          // This array is used to determine the length of a UTF character.
 253          // Be $c the result of ($str[$pos] & "\xF0") --where $str is the string we're operating on and $pos
 254          // the position of the cursor--, if $utf_len_mask[$c] does not exist, the byte is an ASCII char.
 255          // Otherwise, if $utf_len_mask[$c] is greater than 0, we have a the leading byte of a multibyte character
 256          // whose length is $utf_len_mask[$c] and if it is equal to 0, the byte is a trailing byte.
 257          $utf_len_mask = array(
 258              // Leading bytes masks
 259              "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
 260              // Trailing bytes masks
 261              "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
 262          );
 263  
 264          $extra_check = array(
 265              "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
 266              "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
 267              "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
 268          );
 269  
 270          $utf_validation_mask = array(
 271              2    => "\xE0\xC0",
 272              3    => "\xF0\xC0\xC0",
 273              4    => "\xF8\xC0\xC0\xC0"
 274          );
 275  
 276          $utf_validation_check = array(
 277              2    => "\xC0\x80",
 278              3    => "\xE0\x80\x80",
 279              4    => "\xF0\x80\x80\x80"
 280          );
 281  
 282          // Main loop
 283          do
 284          {
 285              // STEP 0: Capture the current char and buffer it
 286              $c = $str[$pos];
 287              $c_mask = $c & "\xF0";
 288  
 289              if (isset($utf_len_mask[$c_mask]))
 290              {
 291                  // Byte at $pos is either a leading byte or a missplaced trailing byte
 292                  if ($utf_len = $utf_len_mask[$c_mask])
 293                  {
 294                      // Capture the char
 295                      $buffer[++$i & 7] = $utf_char = substr($str, $pos, $utf_len);
 296  
 297                      // Let's find out if a thorough check is needed
 298                      if (isset($qc[$utf_char]))
 299                      {
 300                          // If the UTF char is in the qc array then it may not be in normal form. We do nothing here, the actual processing is below this "if" block
 301                      }
 302                      else if (isset($utf_combining_class[$utf_char]))
 303                      {
 304                          if ($utf_combining_class[$utf_char] < $last_cc)
 305                          {
 306                              // A combining character that is NOT canonically ordered
 307                          }
 308                          else
 309                          {
 310                              // A combining character that IS canonically ordered, skip to the next char
 311                              $last_cc = $utf_combining_class[$utf_char];
 312  
 313                              $pos += $utf_len;
 314                              continue;
 315                          }
 316                      }
 317                      else
 318                      {
 319                          // At this point, $utf_char holds a UTF char that we know is not a NF[K]C_QC and is not a combining character.
 320                          // It can be a singleton, a canonical composite, a replacement char or an even an ill-formed bunch of bytes. Let's find out
 321                          $last_cc = 0;
 322  
 323                          // Check that we have the correct number of trailing bytes
 324                          if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
 325                          {
 326                              // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
 327                              // has been encoded in a five- or six- byte sequence
 328                              if ($utf_char[0] >= "\xF8")
 329                              {
 330                                  if ($utf_char[0] < "\xF8")
 331                                  {
 332                                      $trailing_bytes = 3;
 333                                  }
 334                                  else if ($utf_char[0] < "\xFC")
 335                                  {
 336                                      $trailing_bytes = 4;
 337                                  }
 338  
 339                                  if ($utf_char[0] > "\xFD")
 340                                  {
 341                                      $trailing_bytes = 0;
 342                                  }
 343                                  else
 344                                  {
 345                                      $trailing_bytes = 5;
 346                                  }
 347                              }
 348                              else
 349                              {
 350                                  $trailing_bytes = $utf_len - 1;
 351                              }
 352  
 353                              $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 354                              $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
 355                              $tmp_pos = $pos;
 356  
 357                              continue;
 358                          }
 359  
 360                          if (isset($extra_check[$c]))
 361                          {
 362                              switch ($c)
 363                              {
 364                                  // Note: 0xED is quite common in Korean
 365                                  case "\xED":
 366                                      if ($utf_char >= "\xED\xA0\x80")
 367                                      {
 368                                          // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
 369                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 370                                          $pos += $utf_len;
 371                                          $tmp_pos = $pos;
 372                                          continue 2;
 373                                      }
 374                                  break;
 375  
 376                                  // Note: 0xEF is quite common in Japanese
 377                                  case "\xEF":
 378                                      if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
 379                                      {
 380                                          // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
 381                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 382                                          $pos += $utf_len;
 383                                          $tmp_pos = $pos;
 384                                          continue 2;
 385                                      }
 386                                  break;
 387  
 388                                  case "\xC0":
 389                                  case "\xC1":
 390                                      if ($utf_char <= "\xC1\xBF")
 391                                      {
 392                                          // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
 393                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 394                                          $pos += $utf_len;
 395                                          $tmp_pos = $pos;
 396                                          continue 2;
 397                                      }
 398                                  break;
 399  
 400                                  case "\xE0":
 401                                      if ($utf_char <= "\xE0\x9F\xBF")
 402                                      {
 403                                          // Unicode char U+0000..U+07FF encoded in 3 bytes
 404                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 405                                          $pos += $utf_len;
 406                                          $tmp_pos = $pos;
 407                                          continue 2;
 408                                      }
 409                                  break;
 410  
 411                                  case "\xF0":
 412                                      if ($utf_char <= "\xF0\x8F\xBF\xBF")
 413                                      {
 414                                          // Unicode char U+0000..U+FFFF encoded in 4 bytes
 415                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 416                                          $pos += $utf_len;
 417                                          $tmp_pos = $pos;
 418                                          continue 2;
 419                                      }
 420                                  break;
 421  
 422                                  default:
 423                                      // Five- and six- byte sequences do not need being checked for here anymore
 424                                      if ($utf_char > UTF8_MAX)
 425                                      {
 426                                          // Out of the Unicode range
 427                                          if ($utf_char[0] < "\xF8")
 428                                          {
 429                                              $trailing_bytes = 3;
 430                                          }
 431                                          else if ($utf_char[0] < "\xFC")
 432                                          {
 433                                              $trailing_bytes = 4;
 434                                          }
 435                                          else if ($utf_char[0] > "\xFD")
 436                                          {
 437                                              $trailing_bytes = 0;
 438                                          }
 439                                          else
 440                                          {
 441                                              $trailing_bytes = 5;
 442                                          }
 443  
 444                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
 445                                          $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
 446                                          $tmp_pos = $pos;
 447                                          continue 2;
 448                                      }
 449                                  break;
 450                              }
 451                          }
 452  
 453                          // The char is a valid starter, move the cursor and go on
 454                          $pos += $utf_len;
 455                          continue;
 456                      }
 457                  }
 458                  else
 459                  {
 460                      // A trailing byte came out of nowhere, we will advance the cursor and treat the this byte and all following trailing bytes as if
 461                      // each of them was a Unicode replacement char
 462                      $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
 463                      $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
 464  
 465                      $pos += $spn;
 466                      $tmp_pos = $pos;
 467                      continue;
 468                  }
 469  
 470  
 471                  // STEP 1: Decompose current char
 472  
 473                  // We have found a character that is either:
 474                  //  - in the NFC_QC/NFKC_QC list
 475                  //  - a non-starter char that is not canonically ordered
 476                  //
 477                  // We are going to capture the shortest UTF sequence that satisfies these two conditions:
 478                  //
 479                  //  1 - If the sequence does not start at the begginning of the string, it must begin with a starter,
 480                  // and that starter must not have the NF[K]C_QC property equal to "MAYBE"
 481                  //
 482                  //  2 - If the sequence does not end at the end of the string, it must end with a non-starter and be
 483                  // immediately followed by a starter that is not on the QC list
 484                  //
 485                  $utf_seq = array();
 486                  $last_cc = 0;
 487                  $lpos = $pos;
 488                  $pos += $utf_len;
 489  
 490                  if (isset($decomp_map[$utf_char]))
 491                  {
 492                      $_pos = 0;
 493                      $_len = strlen($decomp_map[$utf_char]);
 494  
 495                      do
 496                      {
 497                          $_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];
 498  
 499                          if (isset($_utf_len))
 500                          {
 501                              $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
 502                              $_pos += $_utf_len;
 503                          }
 504                          else
 505                          {
 506                              $utf_seq[] = $decomp_map[$utf_char][$_pos];
 507                              ++$_pos;
 508                          }
 509                      }
 510                      while ($_pos < $_len);
 511                  }
 512                  else
 513                  {
 514                      // The char is not decomposable
 515                      $utf_seq = array($utf_char);
 516                  }
 517  
 518  
 519                  // STEP 2: Capture the starter
 520  
 521                  // Check out the combining class of the first character of the UTF sequence
 522                  $k = 0;
 523                  if (isset($utf_combining_class[$utf_seq[0]]) || $qc[$utf_char] == UNICODE_QC_MAYBE)
 524                  {
 525                      // Not a starter, inspect previous characters
 526                      // The last 8 characters are kept in a buffer so that we don't have to capture them everytime.
 527                      // This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,
 528                      // although it is slower than this method.
 529                      //
 530                      // In the following loop, $j starts at the previous buffered character ($i - 1, because current character is
 531                      // at offset $i) and process them in backward mode until we find a starter.
 532                      //
 533                      // $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more
 534                      // characters numbered 0 to n. $k starts at 0 and for each char we prepend we pre-decrement it and for numbering
 535                      $starter_found = 0;
 536                      $j_min = max(1, $i - 7);
 537  
 538                      for ($j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j)
 539                      {
 540                          $utf_char = $buffer[$j & 7];
 541                          $lpos -= strlen($utf_char);
 542  
 543                          if (isset($decomp_map[$utf_char]))
 544                          {
 545                              // The char is a composite, decompose for storage
 546                              $decomp_seq = array();
 547                              $_pos = 0;
 548                              $_len = strlen($decomp_map[$utf_char]);
 549  
 550                              do
 551                              {
 552                                  $c = $decomp_map[$utf_char][$_pos];
 553                                  $_utf_len =& $utf_len_mask[$c & "\xF0"];
 554  
 555                                  if (isset($_utf_len))
 556                                  {
 557                                      $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
 558                                      $_pos += $_utf_len;
 559                                  }
 560                                  else
 561                                  {
 562                                      $decomp_seq[] = $c;
 563                                      ++$_pos;
 564                                  }
 565                              }
 566                              while ($_pos < $_len);
 567  
 568                              // Prepend the UTF sequence with our decomposed sequence
 569                              if (isset($decomp_seq[1]))
 570                              {
 571                                  // The char expanded into several chars
 572                                  $decomp_cnt = sizeof($decomp_seq);
 573  
 574                                  foreach ($decomp_seq as $decomp_i => $decomp_char)
 575                                  {
 576                                      $utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;
 577                                  }
 578                                  $k -= $decomp_cnt;
 579                              }
 580                              else
 581                              {
 582                                  // Decomposed to a single char, easier to prepend
 583                                  $utf_seq[--$k] = $decomp_seq[0];
 584                              }
 585                          }
 586                          else
 587                          {
 588                              $utf_seq[--$k] = $utf_char;
 589                          }
 590  
 591                          if (!isset($utf_combining_class[$utf_seq[$k]]))
 592                          {
 593                              // We have found our starter
 594                              $starter_found = 1;
 595                              break;
 596                          }
 597                      }
 598  
 599                      if (!$starter_found && $lpos > $tmp_pos)
 600                      {
 601                          // The starter was not found in the buffer, let's rewind some more
 602                          do
 603                          {
 604                              // $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_en > 0 then it's a leading byte, otherwise it's a trailing byte.
 605                              $c = $str[--$lpos];
 606                              $c_mask = $c & "\xF0";
 607  
 608                              if (isset($utf_len_mask[$c_mask]))
 609                              {
 610                                  // UTF byte
 611                                  if ($utf_len = $utf_len_mask[$c_mask])
 612                                  {
 613                                      // UTF *leading* byte
 614                                      $utf_char = substr($str, $lpos, $utf_len);
 615  
 616                                      if (isset($decomp_map[$utf_char]))
 617                                      {
 618                                          // Decompose the character
 619                                          $decomp_seq = array();
 620                                          $_pos = 0;
 621                                          $_len = strlen($decomp_map[$utf_char]);
 622  
 623                                          do
 624                                          {
 625                                              $c = $decomp_map[$utf_char][$_pos];
 626                                              $_utf_len =& $utf_len_mask[$c & "\xF0"];
 627  
 628                                              if (isset($_utf_len))
 629                                              {
 630                                                  $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
 631                                                  $_pos += $_utf_len;
 632                                              }
 633                                              else
 634                                              {
 635                                                  $decomp_seq[] = $c;
 636                                                  ++$_pos;
 637                                              }
 638                                          }
 639                                          while ($_pos < $_len);
 640  
 641                                          // Prepend the UTF sequence with our decomposed sequence
 642                                          if (isset($decomp_seq[1]))
 643                                          {
 644                                              // The char expanded into several chars
 645                                              $decomp_cnt = sizeof($decomp_seq);
 646                                              foreach ($decomp_seq as $decomp_i => $utf_char)
 647                                              {
 648                                                  $utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;
 649                                              }
 650                                              $k -= $decomp_cnt;
 651                                          }
 652                                          else
 653                                          {
 654                                              // Decomposed to a single char, easier to prepend
 655                                              $utf_seq[--$k] = $decomp_seq[0];
 656                                          }
 657                                      }
 658                                      else
 659                                      {
 660                                          $utf_seq[--$k] = $utf_char;
 661                                      }
 662                                  }
 663                              }
 664                              else
 665                              {
 666                                  // ASCII char
 667                                  $utf_seq[--$k] = $c;
 668                              }
 669                          }
 670                          while ($lpos > $tmp_pos);
 671                      }
 672                  }
 673  
 674  
 675                  // STEP 3: Capture following combining modifiers
 676  
 677                  while ($pos < $len)
 678                  {
 679                      $c_mask = $str[$pos] & "\xF0";
 680  
 681                      if (isset($utf_len_mask[$c_mask]))
 682                      {
 683                          if ($utf_len = $utf_len_mask[$c_mask])
 684                          {
 685                              $utf_char = substr($str, $pos, $utf_len);
 686                          }
 687                          else
 688                          {
 689                              // A trailing byte came out of nowhere
 690                              // Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop
 691                              // as if it was a starter (replacement chars ARE starters) and let the next loop replace it
 692                              break;
 693                          }
 694  
 695                          if (isset($utf_combining_class[$utf_char]) || isset($qc[$utf_char]))
 696                          {
 697                              // Combining character, add it to the sequence and move the cursor
 698                              if (isset($decomp_map[$utf_char]))
 699                              {
 700                                  // Decompose the character
 701                                  $_pos = 0;
 702                                  $_len = strlen($decomp_map[$utf_char]);
 703  
 704                                  do
 705                                  {
 706                                      $c = $decomp_map[$utf_char][$_pos];
 707                                      $_utf_len =& $utf_len_mask[$c & "\xF0"];
 708  
 709                                      if (isset($_utf_len))
 710                                      {
 711                                          $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
 712                                          $_pos += $_utf_len;
 713                                      }
 714                                      else
 715                                      {
 716                                          $utf_seq[] = $c;
 717                                          ++$_pos;
 718                                      }
 719                                  }
 720                                  while ($_pos < $_len);
 721                              }
 722                              else
 723                              {
 724                                  $utf_seq[] = $utf_char;
 725                              }
 726  
 727                              $pos += $utf_len;
 728                          }
 729                          else
 730                          {
 731                              // Combining class 0 and no QC, break out of the loop
 732                              // Note: we do not know if that character is valid. If it's not, the next iteration will replace it
 733                              break;
 734                          }
 735                      }
 736                      else
 737                      {
 738                          // ASCII chars are starters
 739                          break;
 740                      }
 741                  }
 742  
 743  
 744                  // STEP 4: Sort and combine
 745  
 746                  // Here we sort...
 747                  $k_max = $k + sizeof($utf_seq);
 748  
 749                  if (!$k && $k_max == 1)
 750                  {
 751                      // There is only one char in the UTF sequence, add it then jump to the next iteration of main loop
 752                          // Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases
 753  //                        if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))
 754  //                        {
 755                          $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $utf_seq[0];
 756                          $tmp_pos = $pos;
 757  //                        }
 758  
 759                      continue;
 760                  }
 761  
 762                  // ...there we combine
 763                  if (isset($utf_combining_class[$utf_seq[$k]]))
 764                  {
 765                      $starter = $nf_seq = '';
 766                  }
 767                  else
 768                  {
 769                      $starter = $utf_seq[$k++];
 770                      $nf_seq = '';
 771                  }
 772                  $utf_sort = array();
 773  
 774                  // We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine
 775                  // at the end of the string without altering it
 776                  $utf_seq[] = '';
 777  
 778                  do
 779                  {
 780                      $utf_char = $utf_seq[$k++];
 781  
 782                      if (isset($utf_combining_class[$utf_char]))
 783                      {
 784                          $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
 785                      }
 786                      else
 787                      {
 788                          if (empty($utf_sort))
 789                          {
 790                              // No combining characters... check for a composite of the two starters
 791                              if (isset($utf_canonical_comp[$starter . $utf_char]))
 792                              {
 793                                  // Good ol' composite character
 794                                  $starter = $utf_canonical_comp[$starter . $utf_char];
 795                              }
 796                              else if (isset($utf_jamo_type[$utf_char]))
 797                              {
 798                                  // Current char is a composable jamo
 799                                  if (isset($utf_jamo_type[$starter]) && $utf_jamo_type[$starter] == UNICODE_JAMO_L && $utf_jamo_type[$utf_char] == UNICODE_JAMO_V)
 800                                  {
 801                                      // We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo
 802                                      if (isset($utf_jamo_type[$utf_seq[$k]]) && $utf_jamo_type[$utf_seq[$k]] == UNICODE_JAMO_T)
 803                                      {
 804                                          // L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)
 805                                          $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char] + $utf_jamo_index[$utf_seq[$k]];
 806                                          ++$k;
 807                                      }
 808                                      else
 809                                      {
 810                                          // L+V jamos, combine to a LV Hangul syllable
 811                                          $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char];
 812                                      }
 813  
 814                                      $starter = chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
 815                                  }
 816                                  else
 817                                  {
 818                                      // Non-composable jamo, just add it to the sequence
 819                                      $nf_seq .= $starter;
 820                                      $starter = $utf_char;
 821                                  }
 822                              }
 823                              else
 824                              {
 825                                  // No composite, just add the first starter to the sequence then continue with the other one
 826                                  $nf_seq .= $starter;
 827                                  $starter = $utf_char;
 828                              }
 829                          }
 830                          else
 831                          {
 832                              ksort($utf_sort);
 833  
 834                              // For each class of combining characters
 835                              foreach ($utf_sort as $cc => $utf_chars)
 836                              {
 837                                  $j = 0;
 838  
 839                                  do
 840                                  {
 841                                      // Look for a composite
 842                                      if (isset($utf_canonical_comp[$starter . $utf_chars[$j]]))
 843                                      {
 844                                          // Found a composite, replace the starter
 845                                          $starter = $utf_canonical_comp[$starter . $utf_chars[$j]];
 846                                          unset($utf_sort[$cc][$j]);
 847                                      }
 848                                      else
 849                                      {
 850                                          // No composite, all following characters in that class are blocked
 851                                          break;
 852                                      }
 853                                  }
 854                                  while (isset($utf_sort[$cc][++$j]));
 855                              }
 856  
 857                              // Add the starter to the normalized sequence, followed by non-starters in canonical order
 858                              $nf_seq .= $starter;
 859  
 860                              foreach ($utf_sort as $utf_chars)
 861                              {
 862                                  if (!empty($utf_chars))
 863                                  {
 864                                      $nf_seq .= implode('', $utf_chars);
 865                                  }
 866                              }
 867  
 868                              // Reset the array and go on
 869                              $utf_sort = array();
 870                              $starter = $utf_char;
 871                          }
 872                      }
 873                  }
 874                  while ($k <= $k_max);
 875  
 876                  $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $nf_seq;
 877                  $tmp_pos = $pos;
 878              }
 879              else
 880              {
 881                  // Only an ASCII char can make the program get here
 882                  //
 883                  // First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().
 884                  //
 885                  // The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on
 886                  // multi-byte text (where the only ASCII chars are spaces and punctuation)
 887                  if (++$pos != $len)
 888                  {
 889                      if ($str[$pos] < "\x80")
 890                      {
 891                          $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
 892                          $buffer[++$i & 7] = $str[$pos - 1];
 893                      }
 894                      else
 895                      {
 896                          $buffer[++$i & 7] = $c;
 897                      }
 898                  }
 899              }
 900          }
 901          while ($pos < $len);
 902  
 903          // Now is time to return the string
 904          if ($tmp_pos)
 905          {
 906              // If the $tmp_pos cursor is not at the beginning of the string then at least one character was not in normal form. Replace $str with the fixed version
 907              if ($tmp_pos == $len)
 908              {
 909                  // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
 910                  return $tmp;
 911              }
 912              else
 913              {
 914                  // The rightmost chunk of $str has not been appended to $tmp yet
 915                  return $tmp . substr($str, $tmp_pos);
 916              }
 917          }
 918  
 919          // The string was already in normal form
 920          return $str;
 921      }
 922  
 923      /**
 924      * Decompose a UTF string
 925      *
 926      * @param    string    $str        UTF string
 927      * @param    integer    $pos        Position of the first UTF char (in bytes)
 928      * @param    integer    $len        Length of the string (in bytes)
 929      * @param    array    $decomp_map    Decomposition mapping, passed by reference but never modified
 930      * @return    string                The string, decomposed and sorted canonically
 931      *
 932      * @access    private
 933      */
 934  	function decompose($str, $pos, $len, &$decomp_map)
 935      {
 936          global $utf_combining_class, $utf_canonical_decomp, $phpbb_root_path;
 937  
 938          // Load some commonly-used tables
 939          if (!isset($utf_combining_class))
 940          {
 941              include ($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.php');
 942          }
 943  
 944          // UTF char length array
 945          $utf_len_mask = array(
 946              // Leading bytes masks
 947              "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
 948              // Trailing bytes masks
 949              "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
 950          );
 951  
 952          // Some extra checks are triggered on the first byte of a UTF sequence
 953          $extra_check = array(
 954              "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
 955              "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
 956              "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
 957          );
 958  
 959          // These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge:
 960          //   - 2-byte: 110? ???? 10?? ????
 961          //   - 3-byte: 1110 ???? 10?? ???? 10?? ????
 962          //   - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????
 963          // Note that 5- and 6- byte sequences are automatically discarded
 964          $utf_validation_mask = array(
 965              2    => "\xE0\xC0",
 966              3    => "\xF0\xC0\xC0",
 967              4    => "\xF8\xC0\xC0\xC0"
 968          );
 969  
 970          $utf_validation_check = array(
 971              2    => "\xC0\x80",
 972              3    => "\xE0\x80\x80",
 973              4    => "\xF0\x80\x80\x80"
 974          );
 975  
 976          $tmp = '';
 977          $starter_pos = $pos;
 978          $tmp_pos = $last_cc = $sort = $dump = 0;
 979          $utf_sort = array();
 980  
 981  
 982          // Main loop
 983          do
 984          {
 985              // STEP 0: Capture the current char
 986  
 987              $cur_mask = $str[$pos] & "\xF0";
 988              if (isset($utf_len_mask[$cur_mask]))
 989              {
 990                  if ($utf_len = $utf_len_mask[$cur_mask])
 991                  {
 992                      // Multibyte char
 993                      $utf_char = substr($str, $pos, $utf_len);
 994                      $pos += $utf_len;
 995                  }
 996                  else
 997                  {
 998                      // A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode
 999                      // replacement char and we will advance the cursor
1000                      $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
1001  
1002                      if ($dump)
1003                      {
1004                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1005  
1006                          // Dump combiners
1007                          if (!empty($utf_sort))
1008                          {
1009                              if ($sort)
1010                              {
1011                                  ksort($utf_sort);
1012                              }
1013  
1014                              foreach($utf_sort as $utf_chars)
1015                              {
1016                                  $tmp .= implode('', $utf_chars);
1017                              }
1018                          }
1019  
1020                          $tmp .= str_repeat(UTF8_REPLACEMENT, $spn);
1021                          $dump = $sort = 0;
1022                      }
1023                      else
1024                      {
1025                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
1026                      }
1027  
1028                      $pos += $spn;
1029                      $tmp_pos = $starter_pos = $pos;
1030  
1031                      $utf_sort = array();
1032                      $last_cc = 0;
1033  
1034                      continue;
1035                  }
1036  
1037  
1038                  // STEP 1: Decide what to do with current char
1039  
1040                  // Now, in that order:
1041                  //  - check if that character is decomposable
1042                  //  - check if that character is a non-starter
1043                  //  - check if that character requires extra checks to be performed
1044                  if (isset($decomp_map[$utf_char]))
1045                  {
1046                      // Decompose the char
1047                      $_pos = 0;
1048                      $_len = strlen($decomp_map[$utf_char]);
1049  
1050                      do
1051                      {
1052                          $c = $decomp_map[$utf_char][$_pos];
1053                          $_utf_len =& $utf_len_mask[$c & "\xF0"];
1054  
1055                          if (isset($_utf_len))
1056                          {
1057                              $_utf_char = substr($decomp_map[$utf_char], $_pos, $_utf_len);
1058                              $_pos += $_utf_len;
1059  
1060                              if (isset($utf_combining_class[$_utf_char]))
1061                              {
1062                                  // The character decomposed to a non-starter, buffer it for sorting
1063                                  $utf_sort[$utf_combining_class[$_utf_char]][] = $_utf_char;
1064  
1065                                  if ($utf_combining_class[$_utf_char] < $last_cc)
1066                                  {
1067                                      // Not canonically ordered, will require sorting
1068                                      $sort = $dump = 1;
1069                                  }
1070                                  else
1071                                  {
1072                                      $dump = 1;
1073                                      $last_cc = $utf_combining_class[$_utf_char];
1074                                  }
1075                              }
1076                              else
1077                              {
1078                                  // This character decomposition contains a starter, dump the buffer and continue
1079                                  if ($dump)
1080                                  {
1081                                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1082  
1083                                      // Dump combiners
1084                                      if (!empty($utf_sort))
1085                                      {
1086                                          if ($sort)
1087                                          {
1088                                              ksort($utf_sort);
1089                                          }
1090  
1091                                          foreach ($utf_sort as $utf_chars)
1092                                          {
1093                                              $tmp .= implode('', $utf_chars);
1094                                          }
1095                                      }
1096  
1097                                      $tmp .= $_utf_char;
1098                                      $dump = $sort = 0;
1099                                  }
1100                                  else
1101                                  {
1102                                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos) . $_utf_char;
1103                                  }
1104  
1105                                  $tmp_pos = $starter_pos = $pos;
1106                                  $utf_sort = array();
1107                                  $last_cc = 0;
1108                              }
1109                          }
1110                          else
1111                          {
1112                              // This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue
1113                              ++$_pos;
1114  
1115                              if ($dump)
1116                              {
1117                                  $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1118  
1119                                  // Dump combiners
1120                                  if (!empty($utf_sort))
1121                                  {
1122                                      if ($sort)
1123                                      {
1124                                          ksort($utf_sort);
1125                                      }
1126  
1127                                      foreach ($utf_sort as $utf_chars)
1128                                      {
1129                                          $tmp .= implode('', $utf_chars);
1130                                      }
1131                                  }
1132  
1133                                  $tmp .= $c;
1134                                  $dump = $sort = 0;
1135                              }
1136                              else
1137                              {
1138                                  $tmp .= substr($str, $tmp_pos, $pos - $utf_len - $tmp_pos) . $c;
1139                              }
1140  
1141                              $tmp_pos = $starter_pos = $pos;
1142                              $utf_sort = array();
1143                              $last_cc = 0;
1144                          }
1145                      }
1146                      while ($_pos < $_len);
1147                  }
1148                  else if (isset($utf_combining_class[$utf_char]))
1149                  {
1150                      // Combining character
1151                      if ($utf_combining_class[$utf_char] < $last_cc)
1152                      {
1153                          // Not in canonical order
1154                          $sort = $dump = 1;
1155                      }
1156                      else
1157                      {
1158                          $last_cc = $utf_combining_class[$utf_char];
1159                      }
1160  
1161                      $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
1162                  }
1163                  else
1164                  {
1165                      // Non-decomposable starter, check out if it's a Hangul syllable
1166                      if ($utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST)
1167                      {
1168                          // Nope, regular UTF char, check that we have the correct number of trailing bytes
1169                          if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
1170                          {
1171                              // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
1172                              // has been encoded in a five- or six- byte sequence.
1173                              // Move the cursor back to its original position then advance it to the position it should really be at
1174                              $pos -= $utf_len;
1175                              $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1176  
1177                              if (!empty($utf_sort))
1178                              {
1179                                  ksort($utf_sort);
1180  
1181                                  foreach ($utf_sort as $utf_chars)
1182                                  {
1183                                      $tmp .= implode('', $utf_chars);
1184                                  }
1185                                  $utf_sort = array();
1186                              }
1187  
1188                              // Add a replacement char then another replacement char for every trailing byte.
1189                              //
1190                              // @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this
1191                              $spn = strspn($str, UTF8_TRAILING_BYTES, ++$pos);
1192                              $tmp .= str_repeat(UTF8_REPLACEMENT, $spn + 1);
1193  
1194                              $dump = $sort = 0;
1195  
1196                              $pos += $spn;
1197                              $tmp_pos = $pos;
1198                              continue;
1199                          }
1200  
1201                          if (isset($extra_check[$utf_char[0]]))
1202                          {
1203                              switch ($utf_char[0])
1204                              {
1205                                  // Note: 0xED is quite common in Korean
1206                                  case "\xED":
1207                                      if ($utf_char >= "\xED\xA0\x80")
1208                                      {
1209                                          // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
1210                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1211  
1212                                          if (!empty($utf_sort))
1213                                          {
1214                                              ksort($utf_sort);
1215  
1216                                              foreach ($utf_sort as $utf_chars)
1217                                              {
1218                                                  $tmp .= implode('', $utf_chars);
1219                                              }
1220                                              $utf_sort = array();
1221                                          }
1222  
1223                                          $tmp .= UTF8_REPLACEMENT;
1224                                          $dump = $sort = 0;
1225  
1226                                          $tmp_pos = $starter_pos = $pos;
1227                                          continue 2;
1228                                      }
1229                                  break;
1230  
1231                                  // Note: 0xEF is quite common in Japanese
1232                                  case "\xEF":
1233                                      if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
1234                                      {
1235                                          // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
1236                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1237  
1238                                          if (!empty($utf_sort))
1239                                          {
1240                                              ksort($utf_sort);
1241  
1242                                              foreach ($utf_sort as $utf_chars)
1243                                              {
1244                                                  $tmp .= implode('', $utf_chars);
1245                                              }
1246                                              $utf_sort = array();
1247                                          }
1248  
1249                                          $tmp .= UTF8_REPLACEMENT;
1250                                          $dump = $sort = 0;
1251  
1252                                          $tmp_pos = $starter_pos = $pos;
1253                                          continue 2;
1254                                      }
1255                                  break;
1256  
1257                                  case "\xC0":
1258                                  case "\xC1":
1259                                      if ($utf_char <= "\xC1\xBF")
1260                                      {
1261                                          // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
1262                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1263  
1264                                          if (!empty($utf_sort))
1265                                          {
1266                                              ksort($utf_sort);
1267  
1268                                              foreach ($utf_sort as $utf_chars)
1269                                              {
1270                                                  $tmp .= implode('', $utf_chars);
1271                                              }
1272                                              $utf_sort = array();
1273                                          }
1274  
1275                                          $tmp .= UTF8_REPLACEMENT;
1276                                          $dump = $sort = 0;
1277  
1278                                          $tmp_pos = $starter_pos = $pos;
1279                                          continue 2;
1280                                      }
1281                                  break;
1282  
1283                                  case "\xE0":
1284                                      if ($utf_char <= "\xE0\x9F\xBF")
1285                                      {
1286                                          // Unicode char U+0000..U+07FF encoded in 3 bytes
1287                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1288  
1289                                          if (!empty($utf_sort))
1290                                          {
1291                                              ksort($utf_sort);
1292  
1293                                              foreach ($utf_sort as $utf_chars)
1294                                              {
1295                                                  $tmp .= implode('', $utf_chars);
1296                                              }
1297                                              $utf_sort = array();
1298                                          }
1299  
1300                                          $tmp .= UTF8_REPLACEMENT;
1301                                          $dump = $sort = 0;
1302  
1303                                          $tmp_pos = $starter_pos = $pos;
1304                                          continue 2;
1305                                      }
1306                                  break;
1307  
1308                                  case "\xF0":
1309                                      if ($utf_char <= "\xF0\x8F\xBF\xBF")
1310                                      {
1311                                          // Unicode char U+0000..U+FFFF encoded in 4 bytes
1312                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1313  
1314                                          if (!empty($utf_sort))
1315                                          {
1316                                              ksort($utf_sort);
1317  
1318                                              foreach ($utf_sort as $utf_chars)
1319                                              {
1320                                                  $tmp .= implode('', $utf_chars);
1321                                              }
1322                                              $utf_sort = array();
1323                                          }
1324  
1325                                          $tmp .= UTF8_REPLACEMENT;
1326                                          $dump = $sort = 0;
1327  
1328                                          $tmp_pos = $starter_pos = $pos;
1329                                          continue 2;
1330                                      }
1331                                  break;
1332  
1333                                  default:
1334                                      if ($utf_char > UTF8_MAX)
1335                                      {
1336                                          // Out of the Unicode range
1337                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1338  
1339                                          if (!empty($utf_sort))
1340                                          {
1341                                              ksort($utf_sort);
1342  
1343                                              foreach ($utf_sort as $utf_chars)
1344                                              {
1345                                                  $tmp .= implode('', $utf_chars);
1346                                              }
1347                                              $utf_sort = array();
1348                                          }
1349  
1350                                          $tmp .= UTF8_REPLACEMENT;
1351                                          $dump = $sort = 0;
1352  
1353                                          $tmp_pos = $starter_pos = $pos;
1354                                          continue 2;
1355                                      }
1356                                  break;
1357                              }
1358                          }
1359                      }
1360                      else
1361                      {
1362                          // Hangul syllable
1363                          $idx = (((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE;
1364  
1365                          // LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).
1366                          //
1367                          // The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte
1368                          if ($tIndex = $idx % UNICODE_HANGUL_TCOUNT)
1369                          {
1370                              if ($tIndex < 25)
1371                              {
1372                                  $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";
1373                                  $utf_char[8] = chr(0xA7 + $tIndex);
1374                              }
1375                              else
1376                              {
1377                                  $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";
1378                                  $utf_char[8] = chr(0x67 + $tIndex);
1379                              }
1380                          }
1381                          else
1382                          {
1383                              $utf_char = "\xE1\x84\x00\xE1\x85\x00";
1384                          }
1385  
1386                          $utf_char[2] = chr(0x80 + (int) ($idx / UNICODE_HANGUL_NCOUNT));
1387                          $utf_char[5] = chr(0xA1 + (int) (($idx % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT));
1388  
1389                          // Just like other decompositions, the resulting Jamos must be dumped to the tmp string
1390                          $dump = 1;
1391                      }
1392  
1393                      // Do we need to dump stuff to the tmp string?
1394                      if ($dump)
1395                      {
1396                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1397  
1398                          // Dump combiners
1399                          if (!empty($utf_sort))
1400                          {
1401                              if ($sort)
1402                              {
1403                                  ksort($utf_sort);
1404                              }
1405  
1406                              foreach ($utf_sort as $utf_chars)
1407                              {
1408                                  $tmp .= implode('', $utf_chars);
1409                              }
1410                          }
1411  
1412                          $tmp .= $utf_char;
1413                          $dump = $sort = 0;
1414                          $tmp_pos = $pos;
1415                      }
1416  
1417                      $last_cc = 0;
1418                      $utf_sort = array();
1419                      $starter_pos = $pos;
1420                  }
1421              }
1422              else
1423              {
1424                  // ASCII char, which happens to be a starter (as any other ASCII char)
1425                  if ($dump)
1426                  {
1427                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1428  
1429                      // Dump combiners
1430                      if (!empty($utf_sort))
1431                      {
1432                          if ($sort)
1433                          {
1434                              ksort($utf_sort);
1435                          }
1436  
1437                          foreach ($utf_sort as $utf_chars)
1438                          {
1439                              $tmp .= implode('', $utf_chars);
1440                          }
1441                      }
1442  
1443                      $tmp .= $str[$pos];
1444                      $dump = $sort = 0;
1445                      $tmp_pos = ++$pos;
1446  
1447                      $pos += strspn($str, UTF8_ASCII_RANGE, $pos);
1448                  }
1449                  else
1450                  {
1451                      $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
1452                  }
1453  
1454                  $last_cc = 0;
1455                  $utf_sort = array();
1456                  $starter_pos = $pos;
1457              }
1458          }
1459          while ($pos < $len);
1460  
1461          // Now it is time to return the string
1462          if ($dump)
1463          {
1464              $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1465  
1466              // Dump combiners
1467              if (!empty($utf_sort))
1468              {
1469                  if ($sort)
1470                  {
1471                      ksort($utf_sort);
1472                  }
1473  
1474                  foreach ($utf_sort as $utf_chars)
1475                  {
1476                      $tmp .= implode('', $utf_chars);
1477                  }
1478              }
1479  
1480              return $tmp;
1481  
1482          }
1483          else if ($tmp_pos)
1484          {
1485              // If the $tmp_pos cursor was moved then at least one character was not in normal form. Replace $str with the fixed version
1486              if ($tmp_pos == $len)
1487              {
1488                  // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
1489                  return $tmp;
1490              }
1491              else
1492              {
1493                  // The rightmost chunk of $str has not been appended to $tmp yet
1494                  return $tmp . substr($str, $tmp_pos);
1495              }
1496          }
1497  
1498          // The string was already in normal form
1499          return $str;
1500      }
1501  }
1502  
1503  ?>
PHP Cross Reference of phpBB 3.0 Beta 3

/includes/utf/ -> utf_normalizer.php (source)