[ Index ] |
PHP Cross Reference of phpBB 3.0 Beta 3 |
[Summary view] [Print] [Text view]
<?php
/**
*
* @package phpBB3
* @version $Id: utf_tools.php,v 1.26 2006/11/12 14:29:32 naderman Exp $
* @copyright (c) 2006 phpBB Group
* @license http://opensource.org/licenses/gpl-license.php GNU Public License
*
* @todo make sure the replacements are called correctly
* already done: strtolower, strtoupper, ucfirst, str_split, strrpos, strlen (hopefully!), strpos, substr
* remaining: clean_username, htmlentities (no longer needed for internal data?), htmlspecialchars (using charset)
* strspn, chr, ord
*/

/**
*/
if (!defined('IN_PHPBB'))
{
	exit;
}

/**
* UTF-8 tools
*
* Whenever possible, these functions will try to use PHP's built-in functions or
* extensions, otherwise they will default to custom routines.
*
* @package phpBB3
*/

// The fallbacks are keyed on the functions themselves rather than on the xml
// extension: declaring them while PHP already provides them is a fatal error.
if (!function_exists('utf8_encode'))
{
	/**
	* Implementation of PHP's native utf8_encode for people without XML support
	* This function exploits some nice things that ISO-8859-1 and UTF-8 have in common
	*
	* @param string $str ISO-8859-1 encoded data
	* @return string UTF-8 encoded data
	*/
	function utf8_encode($str)
	{
		$out = '';
		for ($i = 0, $len = strlen($str); $i < $len; $i++)
		{
			$letter = $str[$i];
			$num = ord($letter);
			if ($num < 0x80)
			{
				// ASCII is identical in both encodings
				$out .= $letter;
			}
			else if ($num < 0xC0)
			{
				// U+0080..U+00BF: lead byte C2, trail byte equals the Latin-1 byte
				$out .= "\xC2" . $letter;
			}
			else
			{
				// U+00C0..U+00FF: lead byte C3, trail byte is the Latin-1 byte minus 0x40
				$out .= "\xC3" . chr($num - 64);
			}
		}
		return $out;
	}
}

if (!function_exists('utf8_decode'))
{
	/**
	* Implementation of PHP's native utf8_decode for people without XML support
	*
	* Characters that cannot be represented in ISO-8859-1 are replaced with '?'.
	*
	* @param string $str UTF-8 encoded data
	* @return string ISO-8859-1 encoded data
	*/
	function utf8_decode($str)
	{
		$pos = 0;
		$len = strlen($str);
		$ret = '';

		while ($pos < $len)
		{
			// the high nibble of the lead byte tells us the sequence length
			$ord = ord($str[$pos]) & 0xF0;
			if ($ord === 0xC0 || $ord === 0xD0)
			{
				if (!isset($str[$pos + 1]))
				{
					// truncated two byte sequence at the end of the input
					$ret .= '?';
					break;
				}

				$charval = ((ord($str[$pos]) & 0x1F) << 6) | (ord($str[$pos + 1]) & 0x3F);
				$pos += 2;
				$ret .= (($charval < 256) ? chr($charval) : '?');
			}
			else if ($ord === 0xE0)
			{
				// three byte sequence, never representable in ISO-8859-1
				$ret .= '?';
				$pos += 3;
			}
			else if ($ord === 0xF0)
			{
				// four byte sequence, never representable in ISO-8859-1
				$ret .= '?';
				$pos += 4;
			}
			else
			{
				$ret .= $str[$pos];
				++$pos;
			}
		}
		return $ret;
	}
}

// mbstring is old and has it's functions around for older versions of PHP.
// if mbstring is not loaded, we go into native mode.
if (extension_loaded('mbstring'))
{
	/**
	* UTF-8 aware alternative to strrpos
	* Find position of last occurrence of a char in a string
	*
	* Notes:
	* - offset for mb_strrpos was added in 5.2.0, we emulate if it is lower
	*/
	if (version_compare(phpversion(), '5.2.0', '>='))
	{
		/**
		* UTF-8 aware alternative to strrpos
		*
		* @param string $str haystack
		* @param string $needle needle
		* @param integer $offset (optional) offset in characters
		* @return mixed integer position or FALSE if the needle was not found
		* @ignore
		*/
		function utf8_strrpos($str, $needle, $offset = null)
		{
			// Emulate behaviour of strrpos rather than raising warning.
			// The string '0' is "empty" for PHP but is a perfectly valid haystack.
			if (empty($str) && $str !== '0')
			{
				return false;
			}

			if (is_null($offset))
			{
				return mb_strrpos($str, $needle);
			}

			return mb_strrpos($str, $needle, $offset);
		}
	}
	else
	{
		/**
		* UTF-8 aware alternative to strrpos
		*
		* @param string $str haystack
		* @param string $needle needle
		* @param integer $offset (optional) offset in characters (from left)
		* @return mixed integer position or FALSE if the needle was not found
		* @ignore
		*/
		function utf8_strrpos($str, $needle, $offset = null)
		{
			// offset for mb_strrpos was added in 5.2.0
			if (is_null($offset))
			{
				// Emulate behaviour of strrpos rather than raising warning.
				// The string '0' is "empty" for PHP but is a perfectly valid haystack.
				if (empty($str) && $str !== '0')
				{
					return false;
				}

				return mb_strrpos($str, $needle);
			}
			else
			{
				if (!is_int($offset))
				{
					trigger_error('utf8_strrpos expects parameter 3 to be long', E_USER_WARNING);
					return false;
				}

				// search in the tail only, then translate the position back
				$str = mb_substr($str, $offset);

				if (false !== ($pos = mb_strrpos($str, $needle)))
				{
					return $pos + $offset;
				}

				return false;
			}
		}
	}

	/**
	* UTF-8 aware alternative to strpos
	*
	* @param string $str haystack
	* @param string $needle needle
	* @param integer $offset (optional) offset in characters
	* @return mixed integer position or FALSE if the needle was not found
	* @ignore
	*/
	function utf8_strpos($str, $needle, $offset = null)
	{
		if (is_null($offset))
		{
			return mb_strpos($str, $needle);
		}

		return mb_strpos($str, $needle, $offset);
	}

	/**
	* UTF-8 aware alternative to strtolower
	* @ignore
	*/
	function utf8_strtolower($str)
	{
		return mb_strtolower($str);
	}

	/**
	* UTF-8 aware alternative to strtoupper
	* @ignore
	*/
	function utf8_strtoupper($str)
	{
		return mb_strtoupper($str);
	}

	/**
	* UTF-8 aware alternative to substr
	*
	* @param string $str input string
	* @param integer $offset offset in characters
	* @param integer $length (optional) length in characters, the rest of the string if omitted
	* @return string the requested part of the string
	* @ignore
	*/
	function utf8_substr($str, $offset, $length = null)
	{
		// never forward a null length: older versions of mb_substr treat it as 0
		if (is_null($length))
		{
			return mb_substr($str, $offset);
		}

		return mb_substr($str, $offset, $length);
	}

	/**
	* Return the length (in characters) of a UTF-8 string
	* @ignore
	*/
	function utf8_strlen($text)
	{
		return mb_strlen($text, 'utf-8');
	}
}
else
{
	/**
	* UTF-8 aware alternative to strrpos
	* Find position of last occurrence of a char in a string
	*
	* @author Harry Fuecks
	* @param string haystack
	* @param string needle
	* @param integer (optional) offset (from left)
	* @return mixed integer position or FALSE on failure
	*/
	function utf8_strrpos($str, $needle, $offset = null)
	{
		if (is_null($offset))
		{
			$ar = explode($needle, $str);

			if (sizeof($ar) > 1)
			{
				// Pop off the end of the string where the last match was made
				array_pop($ar);
				$str = join($needle, $ar);

				// what is left is everything in front of the last match
				return utf8_strlen($str);
			}
			return false;
		}
		else
		{
			if (!is_int($offset))
			{
				trigger_error('utf8_strrpos expects parameter 3 to be long', E_USER_WARNING);
				return false;
			}

			$str = utf8_substr($str, $offset);

			if (false !== ($pos = utf8_strrpos($str, $needle)))
			{
				return $pos + $offset;
			}

			return false;
		}
	}

	/**
	* UTF-8 aware alternative to strpos
	* Find position of first occurrence of a string
	*
	* @author Harry Fuecks
	* @param string haystack
	* @param string needle
	* @param integer offset in characters (from left)
	* @return mixed integer position or FALSE on failure
	*/
	function utf8_strpos($str, $needle, $offset = null)
	{
		if (is_null($offset))
		{
			$ar = explode($needle, $str);
			if (sizeof($ar) > 1)
			{
				// the first chunk is everything in front of the first match
				return utf8_strlen($ar[0]);
			}
			return false;
		}
		else
		{
			if (!is_int($offset))
			{
				trigger_error('utf8_strpos: Offset must be an integer', E_USER_ERROR);
				return false;
			}

			$str = utf8_substr($str, $offset);

			if (false !== ($pos = utf8_strpos($str, $needle)))
			{
				return $pos + $offset;
			}

			return false;
		}
	}

	// Case mapping tables used by the native utf8_strtolower()/utf8_strtoupper()
	$UTF8_UPPER_TO_LOWER = array(
		"\x41" => "\x61", "\x42" => "\x62", "\x43" => "\x63", "\x44" => "\x64",
		"\x45" => "\x65", "\x46" => "\x66", "\x47" => "\x67", "\x48" => "\x68",
		"\x49" => "\x69", "\x4A" => "\x6A", "\x4B" => "\x6B", "\x4C" => "\x6C",
		"\x4D" => "\x6D", "\x4E" => "\x6E", "\x4F" => "\x6F", "\x50" => "\x70",
		"\x51" => "\x71", "\x52" => "\x72", "\x53" => "\x73", "\x54" => "\x74",
		"\x55" => "\x75", "\x56" => "\x76", "\x57" => "\x77", "\x58" => "\x78",
		"\x59" => "\x79", "\x5A" => "\x7A", "\xC3\x80" => "\xC3\xA0", "\xC3\x81" => "\xC3\xA1",
		"\xC3\x82" => "\xC3\xA2", "\xC3\x83" => "\xC3\xA3", "\xC3\x84" => "\xC3\xA4", "\xC3\x85" => "\xC3\xA5",
		"\xC3\x86" => "\xC3\xA6", "\xC3\x87" => "\xC3\xA7", "\xC3\x88" => "\xC3\xA8", "\xC3\x89" => "\xC3\xA9",
		"\xC3\x8A" => "\xC3\xAA", "\xC3\x8B" => "\xC3\xAB", "\xC3\x8C" => "\xC3\xAC", "\xC3\x8D" => "\xC3\xAD",
		"\xC3\x8E" => "\xC3\xAE", "\xC3\x8F" => "\xC3\xAF", "\xC3\x90" => "\xC3\xB0", "\xC3\x91" => "\xC3\xB1",
		"\xC3\x92" => "\xC3\xB2", "\xC3\x93" => "\xC3\xB3", "\xC3\x94" => "\xC3\xB4", "\xC3\x95" => "\xC3\xB5",
		"\xC3\x96" => "\xC3\xB6", "\xC3\x98" => "\xC3\xB8", "\xC3\x99" => "\xC3\xB9", "\xC3\x9A" => "\xC3\xBA",
		"\xC3\x9B" => "\xC3\xBB", "\xC3\x9C" => "\xC3\xBC", "\xC3\x9D" => "\xC3\xBD", "\xC3\x9E" => "\xC3\xBE",
		"\xC4\x80" => "\xC4\x81", "\xC4\x82" => "\xC4\x83", "\xC4\x84" => "\xC4\x85", "\xC4\x86" => "\xC4\x87",
		"\xC4\x88" => "\xC4\x89", "\xC4\x8A" => "\xC4\x8B", "\xC4\x8C" => "\xC4\x8D", "\xC4\x8E" => "\xC4\x8F",
		"\xC4\x90" => "\xC4\x91", "\xC4\x92" => "\xC4\x93", "\xC4\x96" => "\xC4\x97", "\xC4\x98" => "\xC4\x99",
		"\xC4\x9A" => "\xC4\x9B", "\xC4\x9C" => "\xC4\x9D", "\xC4\x9E" => "\xC4\x9F", "\xC4\xA0" => "\xC4\xA1",
		"\xC4\xA2" => "\xC4\xA3", "\xC4\xA4" => "\xC4\xA5", "\xC4\xA6" => "\xC4\xA7", "\xC4\xA8" => "\xC4\xA9",
		"\xC4\xAA" => "\xC4\xAB", "\xC4\xAE" => "\xC4\xAF", "\xC4\xB4" => "\xC4\xB5", "\xC4\xB6" => "\xC4\xB7",
		"\xC4\xB9" => "\xC4\xBA", "\xC4\xBB" => "\xC4\xBC", "\xC4\xBD" => "\xC4\xBE", "\xC5\x81" => "\xC5\x82",
		"\xC5\x83" => "\xC5\x84", "\xC5\x85" => "\xC5\x86", "\xC5\x87" => "\xC5\x88", "\xC5\x8A" => "\xC5\x8B",
		"\xC5\x8C" => "\xC5\x8D", "\xC5\x90" => "\xC5\x91", "\xC5\x94" => "\xC5\x95", "\xC5\x96" => "\xC5\x97",
		"\xC5\x98" => "\xC5\x99", "\xC5\x9A" => "\xC5\x9B", "\xC5\x9C" => "\xC5\x9D", "\xC5\x9E" => "\xC5\x9F",
		"\xC5\xA0" => "\xC5\xA1", "\xC5\xA2" => "\xC5\xA3", "\xC5\xA4" => "\xC5\xA5", "\xC5\xA6" => "\xC5\xA7",
		"\xC5\xA8" => "\xC5\xA9", "\xC5\xAA" => "\xC5\xAB", "\xC5\xAC" => "\xC5\xAD", "\xC5\xAE" => "\xC5\xAF",
		"\xC5\xB0" => "\xC5\xB1", "\xC5\xB2" => "\xC5\xB3", "\xC5\xB4" => "\xC5\xB5", "\xC5\xB6" => "\xC5\xB7",
		"\xC5\xB8" => "\xC3\xBF", "\xC5\xB9" => "\xC5\xBA", "\xC5\xBB" => "\xC5\xBC", "\xC5\xBD" => "\xC5\xBE",
		"\xC6\xA0" => "\xC6\xA1", "\xC6\xAF" => "\xC6\xB0", "\xC8\x98" => "\xC8\x99", "\xC8\x9A" => "\xC8\x9B",
		"\xCE\x86" => "\xCE\xAC", "\xCE\x88" => "\xCE\xAD", "\xCE\x89" => "\xCE\xAE", "\xCE\x8A" => "\xCE\xAF",
		"\xCE\x8C" => "\xCF\x8C", "\xCE\x8E" => "\xCF\x8D", "\xCE\x8F" => "\xCF\x8E", "\xCE\x91" => "\xCE\xB1",
		"\xCE\x92" => "\xCE\xB2", "\xCE\x93" => "\xCE\xB3", "\xCE\x94" => "\xCE\xB4", "\xCE\x95" => "\xCE\xB5",
		"\xCE\x96" => "\xCE\xB6", "\xCE\x97" => "\xCE\xB7", "\xCE\x98" => "\xCE\xB8", "\xCE\x99" => "\xCE\xB9",
		"\xCE\x9A" => "\xCE\xBA", "\xCE\x9B" => "\xCE\xBB", "\xCE\x9C" => "\xCE\xBC", "\xCE\x9D" => "\xCE\xBD",
		"\xCE\x9E" => "\xCE\xBE", "\xCE\x9F" => "\xCE\xBF", "\xCE\xA0" => "\xCF\x80", "\xCE\xA1" => "\xCF\x81",
		"\xCE\xA3" => "\xCF\x83", "\xCE\xA4" => "\xCF\x84", "\xCE\xA5" => "\xCF\x85", "\xCE\xA6" => "\xCF\x86",
		"\xCE\xA7" => "\xCF\x87", "\xCE\xA8" => "\xCF\x88", "\xCE\xA9" => "\xCF\x89", "\xCE\xAA" => "\xCF\x8A",
		"\xCE\xAB" => "\xCF\x8B", "\xD0\x81" => "\xD1\x91", "\xD0\x82" => "\xD1\x92", "\xD0\x83" => "\xD1\x93",
		"\xD0\x84" => "\xD1\x94", "\xD0\x85" => "\xD1\x95", "\xD0\x86" => "\xD1\x96", "\xD0\x87" => "\xD1\x97",
		"\xD0\x88" => "\xD1\x98", "\xD0\x89" => "\xD1\x99", "\xD0\x8A" => "\xD1\x9A", "\xD0\x8B" => "\xD1\x9B",
		"\xD0\x8C" => "\xD1\x9C", "\xD0\x8E" => "\xD1\x9E", "\xD0\x8F" => "\xD1\x9F", "\xD0\x90" => "\xD0\xB0",
		"\xD0\x91" => "\xD0\xB1", "\xD0\x92" => "\xD0\xB2", "\xD0\x93" => "\xD0\xB3", "\xD0\x94" => "\xD0\xB4",
		"\xD0\x95" => "\xD0\xB5", "\xD0\x96" => "\xD0\xB6", "\xD0\x97" => "\xD0\xB7", "\xD0\x98" => "\xD0\xB8",
		"\xD0\x99" => "\xD0\xB9", "\xD0\x9A" => "\xD0\xBA", "\xD0\x9B" => "\xD0\xBB", "\xD0\x9C" => "\xD0\xBC",
		"\xD0\x9D" => "\xD0\xBD", "\xD0\x9E" => "\xD0\xBE", "\xD0\x9F" => "\xD0\xBF", "\xD0\xA0" => "\xD1\x80",
		"\xD0\xA1" => "\xD1\x81", "\xD0\xA2" => "\xD1\x82", "\xD0\xA3" => "\xD1\x83", "\xD0\xA4" => "\xD1\x84",
		"\xD0\xA5" => "\xD1\x85", "\xD0\xA6" => "\xD1\x86", "\xD0\xA7" => "\xD1\x87", "\xD0\xA8" => "\xD1\x88",
		"\xD0\xA9" => "\xD1\x89", "\xD0\xAA" => "\xD1\x8A", "\xD0\xAB" => "\xD1\x8B", "\xD0\xAC" => "\xD1\x8C",
		"\xD0\xAD" => "\xD1\x8D", "\xD0\xAE" => "\xD1\x8E", "\xD0\xAF" => "\xD1\x8F", "\xD2\x90" => "\xD2\x91",
		"\xE1\xB8\x82" => "\xE1\xB8\x83", "\xE1\xB8\x8A" => "\xE1\xB8\x8B", "\xE1\xB8\x9E" => "\xE1\xB8\x9F", "\xE1\xB9\x80" => "\xE1\xB9\x81",
		"\xE1\xB9\x96" => "\xE1\xB9\x97", "\xE1\xB9\xA0" => "\xE1\xB9\xA1", "\xE1\xB9\xAA" => "\xE1\xB9\xAB", "\xE1\xBA\x80" => "\xE1\xBA\x81",
		"\xE1\xBA\x82" => "\xE1\xBA\x83", "\xE1\xBA\x84" => "\xE1\xBA\x85", "\xE1\xBB\xB2" => "\xE1\xBB\xB3"
	);

	$UTF8_LOWER_TO_UPPER = array(
		"\x61" => "\x41", "\x62" => "\x42", "\x63" => "\x43", "\x64" => "\x44",
		"\x65" => "\x45", "\x66" => "\x46", "\x67" => "\x47", "\x68" => "\x48",
		"\x69" => "\x49", "\x6A" => "\x4A", "\x6B" => "\x4B", "\x6C" => "\x4C",
		"\x6D" => "\x4D", "\x6E" => "\x4E", "\x6F" => "\x4F", "\x70" => "\x50",
		"\x71" => "\x51", "\x72" => "\x52", "\x73" => "\x53", "\x74" => "\x54",
		"\x75" => "\x55", "\x76" => "\x56", "\x77" => "\x57", "\x78" => "\x58",
		"\x79" => "\x59", "\x7A" => "\x5A", "\xC3\xA0" => "\xC3\x80", "\xC3\xA1" => "\xC3\x81",
		"\xC3\xA2" => "\xC3\x82", "\xC3\xA3" => "\xC3\x83", "\xC3\xA4" => "\xC3\x84", "\xC3\xA5" => "\xC3\x85",
		"\xC3\xA6" => "\xC3\x86", "\xC3\xA7" => "\xC3\x87", "\xC3\xA8" => "\xC3\x88", "\xC3\xA9" => "\xC3\x89",
		"\xC3\xAA" => "\xC3\x8A", "\xC3\xAB" => "\xC3\x8B", "\xC3\xAC" => "\xC3\x8C", "\xC3\xAD" => "\xC3\x8D",
		"\xC3\xAE" => "\xC3\x8E", "\xC3\xAF" => "\xC3\x8F", "\xC3\xB0" => "\xC3\x90", "\xC3\xB1" => "\xC3\x91",
		"\xC3\xB2" => "\xC3\x92", "\xC3\xB3" => "\xC3\x93", "\xC3\xB4" => "\xC3\x94", "\xC3\xB5" => "\xC3\x95",
		"\xC3\xB6" => "\xC3\x96", "\xC3\xB8" => "\xC3\x98", "\xC3\xB9" => "\xC3\x99", "\xC3\xBA" => "\xC3\x9A",
		"\xC3\xBB" => "\xC3\x9B", "\xC3\xBC" => "\xC3\x9C", "\xC3\xBD" => "\xC3\x9D", "\xC3\xBE" => "\xC3\x9E",
		"\xC3\xBF" => "\xC5\xB8", "\xC4\x81" => "\xC4\x80", "\xC4\x83" => "\xC4\x82", "\xC4\x85" => "\xC4\x84",
		"\xC4\x87" => "\xC4\x86", "\xC4\x89" => "\xC4\x88", "\xC4\x8B" => "\xC4\x8A", "\xC4\x8D" => "\xC4\x8C",
		"\xC4\x8F" => "\xC4\x8E", "\xC4\x91" => "\xC4\x90", "\xC4\x93" => "\xC4\x92", "\xC4\x97" => "\xC4\x96",
		"\xC4\x99" => "\xC4\x98", "\xC4\x9B" => "\xC4\x9A", "\xC4\x9D" => "\xC4\x9C", "\xC4\x9F" => "\xC4\x9E",
		"\xC4\xA1" => "\xC4\xA0", "\xC4\xA3" => "\xC4\xA2", "\xC4\xA5" => "\xC4\xA4", "\xC4\xA7" => "\xC4\xA6",
		"\xC4\xA9" => "\xC4\xA8", "\xC4\xAB" => "\xC4\xAA", "\xC4\xAF" => "\xC4\xAE", "\xC4\xB5" => "\xC4\xB4",
		"\xC4\xB7" => "\xC4\xB6", "\xC4\xBA" => "\xC4\xB9", "\xC4\xBC" => "\xC4\xBB", "\xC4\xBE" => "\xC4\xBD",
		"\xC5\x82" => "\xC5\x81", "\xC5\x84" => "\xC5\x83", "\xC5\x86" => "\xC5\x85", "\xC5\x88" => "\xC5\x87",
		"\xC5\x8B" => "\xC5\x8A", "\xC5\x8D" => "\xC5\x8C", "\xC5\x91" => "\xC5\x90", "\xC5\x95" => "\xC5\x94",
		"\xC5\x97" => "\xC5\x96", "\xC5\x99" => "\xC5\x98", "\xC5\x9B" => "\xC5\x9A", "\xC5\x9D" => "\xC5\x9C",
		"\xC5\x9F" => "\xC5\x9E", "\xC5\xA1" => "\xC5\xA0", "\xC5\xA3" => "\xC5\xA2", "\xC5\xA5" => "\xC5\xA4",
		"\xC5\xA7" => "\xC5\xA6", "\xC5\xA9" => "\xC5\xA8", "\xC5\xAB" => "\xC5\xAA", "\xC5\xAD" => "\xC5\xAC",
		"\xC5\xAF" => "\xC5\xAE", "\xC5\xB1" => "\xC5\xB0", "\xC5\xB3" => "\xC5\xB2", "\xC5\xB5" => "\xC5\xB4",
		"\xC5\xB7" => "\xC5\xB6", "\xC5\xBA" => "\xC5\xB9", "\xC5\xBC" => "\xC5\xBB", "\xC5\xBE" => "\xC5\xBD",
		"\xC6\xA1" => "\xC6\xA0", "\xC6\xB0" => "\xC6\xAF", "\xC8\x99" => "\xC8\x98", "\xC8\x9B" => "\xC8\x9A",
		"\xCE\xAC" => "\xCE\x86", "\xCE\xAD" => "\xCE\x88", "\xCE\xAE" => "\xCE\x89", "\xCE\xAF" => "\xCE\x8A",
		"\xCE\xB1" => "\xCE\x91", "\xCE\xB2" => "\xCE\x92", "\xCE\xB3" => "\xCE\x93", "\xCE\xB4" => "\xCE\x94",
		"\xCE\xB5" => "\xCE\x95", "\xCE\xB6" => "\xCE\x96", "\xCE\xB7" => "\xCE\x97", "\xCE\xB8" => "\xCE\x98",
		"\xCE\xB9" => "\xCE\x99", "\xCE\xBA" => "\xCE\x9A", "\xCE\xBB" => "\xCE\x9B", "\xCE\xBC" => "\xCE\x9C",
		"\xCE\xBD" => "\xCE\x9D", "\xCE\xBE" => "\xCE\x9E", "\xCE\xBF" => "\xCE\x9F", "\xCF\x80" => "\xCE\xA0",
		"\xCF\x81" => "\xCE\xA1", "\xCF\x83" => "\xCE\xA3", "\xCF\x84" => "\xCE\xA4", "\xCF\x85" => "\xCE\xA5",
		"\xCF\x86" => "\xCE\xA6", "\xCF\x87" => "\xCE\xA7", "\xCF\x88" => "\xCE\xA8", "\xCF\x89" => "\xCE\xA9",
		"\xCF\x8A" => "\xCE\xAA", "\xCF\x8B" => "\xCE\xAB", "\xCF\x8C" => "\xCE\x8C", "\xCF\x8D" => "\xCE\x8E",
		"\xCF\x8E" => "\xCE\x8F", "\xD0\xB0" => "\xD0\x90", "\xD0\xB1" => "\xD0\x91", "\xD0\xB2" => "\xD0\x92",
		"\xD0\xB3" => "\xD0\x93", "\xD0\xB4" => "\xD0\x94", "\xD0\xB5" => "\xD0\x95", "\xD0\xB6" => "\xD0\x96",
		"\xD0\xB7" => "\xD0\x97", "\xD0\xB8" => "\xD0\x98", "\xD0\xB9" => "\xD0\x99", "\xD0\xBA" => "\xD0\x9A",
		"\xD0\xBB" => "\xD0\x9B", "\xD0\xBC" => "\xD0\x9C", "\xD0\xBD" => "\xD0\x9D", "\xD0\xBE" => "\xD0\x9E",
		"\xD0\xBF" => "\xD0\x9F", "\xD1\x80" => "\xD0\xA0", "\xD1\x81" => "\xD0\xA1", "\xD1\x82" => "\xD0\xA2",
		"\xD1\x83" => "\xD0\xA3", "\xD1\x84" => "\xD0\xA4", "\xD1\x85" => "\xD0\xA5", "\xD1\x86" => "\xD0\xA6",
		"\xD1\x87" => "\xD0\xA7", "\xD1\x88" => "\xD0\xA8", "\xD1\x89" => "\xD0\xA9", "\xD1\x8A" => "\xD0\xAA",
		"\xD1\x8B" => "\xD0\xAB", "\xD1\x8C" => "\xD0\xAC", "\xD1\x8D" => "\xD0\xAD", "\xD1\x8E" => "\xD0\xAE",
		"\xD1\x8F" => "\xD0\xAF", "\xD1\x91" => "\xD0\x81", "\xD1\x92" => "\xD0\x82", "\xD1\x93" => "\xD0\x83",
		"\xD1\x94" => "\xD0\x84", "\xD1\x95" => "\xD0\x85", "\xD1\x96" => "\xD0\x86", "\xD1\x97" => "\xD0\x87",
		"\xD1\x98" => "\xD0\x88", "\xD1\x99" => "\xD0\x89", "\xD1\x9A" => "\xD0\x8A", "\xD1\x9B" => "\xD0\x8B",
		"\xD1\x9C" => "\xD0\x8C", "\xD1\x9E" => "\xD0\x8E", "\xD1\x9F" => "\xD0\x8F", "\xD2\x91" => "\xD2\x90",
		"\xE1\xB8\x83" => "\xE1\xB8\x82", "\xE1\xB8\x8B" => "\xE1\xB8\x8A", "\xE1\xB8\x9F" => "\xE1\xB8\x9E", "\xE1\xB9\x81" => "\xE1\xB9\x80",
		"\xE1\xB9\x97" => "\xE1\xB9\x96", "\xE1\xB9\xA1" => "\xE1\xB9\xA0", "\xE1\xB9\xAB" => "\xE1\xB9\xAA", "\xE1\xBA\x81" => "\xE1\xBA\x80",
		"\xE1\xBA\x83" => "\xE1\xBA\x82", "\xE1\xBA\x85" => "\xE1\xBA\x84", "\xE1\xBB\xB3" => "\xE1\xBB\xB2"
	);

	/**
	* UTF-8 aware alternative to strtolower
	* Make a string lowercase
	* Note: The concept of a characters "case" only exists is some alphabets
	* such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
	* not exist in the Chinese alphabet, for example. See Unicode Standard
	* Annex #21: Case Mappings
	*
	* @param string
	* @return string string in lowercase
	*/
	function utf8_strtolower($string)
	{
		global $UTF8_UPPER_TO_LOWER;

		return strtr($string, $UTF8_UPPER_TO_LOWER);
	}

	/**
	* UTF-8 aware alternative to strtoupper
	* Make a string uppercase
	* Note: The concept of a characters "case" only exists is some alphabets
	* such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
	* not exist in the Chinese alphabet, for example. See Unicode Standard
	* Annex #21: Case Mappings
	*
	* @param string
	* @return string string in uppercase
	*/
	function utf8_strtoupper($string)
	{
		global $UTF8_LOWER_TO_UPPER;

		return strtr($string, $UTF8_LOWER_TO_UPPER);
	}

	/**
	* UTF-8 aware alternative to substr
	* Return part of a string given character offset (and optionally length)
	*
	* Note arguments: compared to substr - if offset or length are
	* not integers, this version will not complain but rather massages them
	* into an integer.
	*
	* Note on returned values: substr documentation states false can be
	* returned in some cases (e.g. offset > string length)
	* mb_substr never returns false, it will return an empty string instead.
	* This adopts the mb_substr approach
	*
	* Note on implementation: PCRE only supports repetitions of less than
	* 65536, in order to accept up to MAXINT values for offset and length,
	* we'll repeat a group of 65535 characters when needed.
	*
	* Note on implementation: calculating the number of characters in the
	* string is a relatively expensive operation, so we only carry it out when
	* necessary. It isn't necessary for +ve offsets and no specified length
	*
	* @author Chris Smith<chris@jalakai.co.uk>
	* @param string
	* @param integer number of UTF-8 characters offset (from left)
	* @param integer (optional) length in UTF-8 characters from offset
	* @return string the requested part, or an empty string if nothing matches
	*/
	function utf8_substr($str, $offset, $length = NULL)
	{
		// generates E_NOTICE
		// for PHP4 objects, but not PHP5 objects
		$str = (string) $str;
		$offset = (int) $offset;
		if (!is_null($length))
		{
			$length = (int) $length;
		}

		// handle trivial cases
		if ($length === 0 || ($offset < 0 && $length < 0 && $length < $offset))
		{
			return '';
		}

		// normalise negative offsets (we could use a tail
		// anchored pattern, but they are horribly slow!)
		if ($offset < 0)
		{
			// see notes
			$strlen = utf8_strlen($str);
			$offset = $strlen + $offset;
			if ($offset < 0)
			{
				$offset = 0;
			}
		}

		$op = '';
		$lp = '';

		// establish a pattern for offset, a
		// non-captured group equal in length to offset
		if ($offset > 0)
		{
			$ox = (int) ($offset / 65535);
			$oy = $offset % 65535;

			if ($ox)
			{
				$op = '(?:.{65535}){' . $ox . '}';
			}

			$op = '^(?:' . $op . '.{' . $oy . '})';
		}
		else
		{
			// offset == 0; just anchor the pattern
			$op = '^';
		}

		// establish a pattern for length
		if (is_null($length))
		{
			// the rest of the string
			$lp = '(.*)$';
		}
		else
		{
			if (!isset($strlen))
			{
				// see notes
				$strlen = utf8_strlen($str);
			}

			// another trivial case
			if ($offset > $strlen)
			{
				return '';
			}

			if ($length > 0)
			{
				// reduce any length that would
				// go passed the end of the string
				$length = min($strlen - $offset, $length);

				$lx = (int) ($length / 65535);
				$ly = $length % 65535;

				// positive length requires a captured group
				// of length characters
				if ($lx)
				{
					$lp = '(?:.{65535}){' . $lx . '}';
				}
				$lp = '(' . $lp . '.{'. $ly . '})';
			}
			else if ($length < 0)
			{
				if ($length < ($offset - $strlen))
				{
					return '';
				}

				$lx = (int)((-$length) / 65535);
				$ly = (-$length) % 65535;

				// negative length requires ... capture everything
				// except a group of -length characters
				// anchored at the tail-end of the string
				if ($lx)
				{
					$lp = '(?:.{65535}){' . $lx . '}';
				}
				$lp = '(.*)(?:' . $lp . '.{' . $ly . '})$';
			}
		}

		if (!preg_match('#' . $op . $lp . '#us', $str, $match))
		{
			return '';
		}

		return $match[1];
	}

	/**
	* Return the length (in characters) of a UTF-8 string
	*
	* @param string $text UTF-8 string
	* @return integer Length (in chars) of given string
	*/
	function utf8_strlen($text)
	{
		// Since utf8_decode is replacing multibyte characters to ? strlen works fine
		return strlen(utf8_decode($text));
	}
}

/**
* UTF-8 aware alternative to str_split
* Convert a string to an array
*
* @author Harry Fuecks
* @param string UTF-8 encoded
* @param int number to characters to split string by
* @return mixed array of chunks, or FALSE if the chunk length is not a positive integer
*/
function utf8_str_split($str, $split_len = 1)
{
	if (!preg_match('/^[0-9]+$/', $split_len) || $split_len < 1)
	{
		return false;
	}

	$len = utf8_strlen($str);
	if ($len <= $split_len)
	{
		return array($str);
	}

	// full sized chunks first, then whatever is left at the end of the string
	preg_match_all('/.{' . $split_len . '}|[^\x00]{1,' . $split_len . '}$/us', $str, $ar);
	return $ar[0];
}

/**
* UTF-8 aware alternative to strspn
* Find length of initial segment matching mask
*
* @author Harry Fuecks
* @param string $str string to examine
* @param string $mask list of allowed characters
* @param integer $start (optional) offset in characters to start at
* @param integer $length (optional) number of characters to examine
* @return int number of leading characters of $str that are contained in $mask
*/
function utf8_strspn($str, $mask, $start = null, $length = null)
{
	// escape everything that has a special meaning inside a character class
	// (backslash, dash, brackets, caret) as well as the pattern delimiter
	$mask = preg_replace('!([\\\\\\-\\]\\[/^])!', '\\\\$1', $mask);

	if ($start !== null || $length !== null)
	{
		$str = utf8_substr($str, (int) $start, $length);
	}

	preg_match('/^[' . $mask . ']+/u', $str, $matches);

	if (isset($matches[0]))
	{
		return utf8_strlen($matches[0]);
	}

	return 0;
}

/**
* UTF-8 aware alternative to ucfirst
* Make a string's first character uppercase
*
* @author Harry Fuecks
* @param string
* @return string with first character as upper case (if applicable)
*/
function utf8_ucfirst($str)
{
	switch (utf8_strlen($str))
	{
		case 0:
			return '';

		case 1:
			return utf8_strtoupper($str);

		default:
			// split into the first character and the rest
			preg_match('/^(.{1})(.*)$/us', $str, $matches);
			return utf8_strtoupper($matches[1]) . $matches[2];
	}
}

/**
* Recode a string to UTF-8
*
* If the encoding is not supported, the string is returned as-is
*
* @param string $string Original string
* @param string $encoding Original encoding (lowered)
* @return string The string, encoded in UTF-8
*/
function utf8_recode($string, $encoding)
{
	$encoding = strtolower($encoding);

	if ($encoding == 'utf-8' || !is_string($string) || !isset($string[0]))
	{
		return $string;
	}

	// start with something simple
	if ($encoding == 'iso-8859-1')
	{
		return utf8_encode($string);
	}

	// First, try iconv()
	if (function_exists('iconv'))
	{
		$ret = @iconv($encoding, 'utf-8', $string);

		if (isset($ret[0]))
		{
			return $ret;
		}
	}

	// Try the mb_string extension
	if (function_exists('mb_convert_encoding'))
	{
		$ret = @mb_convert_encoding($string, 'utf-8', $encoding);

		if (isset($ret[0]))
		{
			return $ret;
		}
	}

	// Try the recode extension
	if (function_exists('recode_string'))
	{
		$ret = @recode_string($encoding . '..utf-8', $string);

		if (isset($ret[0]))
		{
			return $ret;
		}
	}

	// If nothing works, check if we have a custom transcoder available
	if (!preg_match('#^[a-z0-9\\-]+$#', $encoding))
	{
		// Make sure the encoding name is alphanumeric, we don't want it to be abused into loading arbitrary files
		trigger_error('Unknown encoding: ' . $encoding, E_USER_ERROR);
	}

	// @todo load a custom transcoder from includes/utf/data/ once they exist.
	// Until then honour the documented contract instead of killing the script.
	return $string;
}

/**
* Replace all UTF-8 chars that are not in ASCII with their NCR
*
* @param string $text UTF-8 string in NFC
* @return string ASCII string using NCRs for non-ASCII chars
*/
function utf8_encode_ncr($text)
{
	return preg_replace_callback('#[\\xC2-\\xF4][\\x80-\\xBF]{1,3}#', 'utf8_encode_ncr_callback', $text);
}

/**
* Callback used in encode_ncr()
*
* Takes a UTF-8 char and replaces it with its NCR. Attention, $m is an array
*
* @param array $m 0-based numerically indexed array passed by preg_replace_callback()
* @return string A HTML NCR built from the code point of the matched char
*/
function utf8_encode_ncr_callback($m)
{
	return '&#' . utf8_ord($m[0]) . ';';
}

/**
* Returns the code point of a single UTF-8 encoded character
*
* The sequence is decoded purely based on its length in bytes (1 to 4), it is not validated.
* Any other length (including the empty string) is returned unchanged.
*
* @param string $chr UTF-8 char
* @return integer UNICODE code point
*/
function utf8_ord($chr)
{
	switch (strlen($chr))
	{
		case 1:
			return ord($chr);

		case 2:
			return ((ord($chr[0]) & 0x1F) << 6) | (ord($chr[1]) & 0x3F);

		case 3:
			return ((ord($chr[0]) & 0x0F) << 12) | ((ord($chr[1]) & 0x3F) << 6) | (ord($chr[2]) & 0x3F);

		case 4:
			return ((ord($chr[0]) & 0x07) << 18) | ((ord($chr[1]) & 0x3F) << 12) | ((ord($chr[2]) & 0x3F) << 6) | (ord($chr[3]) & 0x3F);

		default:
			return $chr;
	}
}

/**
* Converts an NCR to a UTF-8 char
*
* @param integer $cp UNICODE code point
* @return string UTF-8 char
*/
function utf8_chr($cp)
{
	if ($cp > 0xFFFF)
	{
		// four byte sequence
		return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
	}
	else if ($cp > 0x7FF)
	{
		// three byte sequence
		return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
	}
	else if ($cp > 0x7F)
	{
		// two byte sequence
		return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));
	}
	else
	{
		return chr($cp);
	}
}

/**
* Convert Numeric Character References to UTF-8 chars
*
* Notes:
* - we do not convert NCRs recursively, if you pass &#38;#38; it will return &#38;
* - we DO NOT check for the existence of the Unicode characters, therefore an entity
* may be converted to an inexistent codepoint
* - only NCRs with up to 6 decimal or up to 5 hexadecimal digits are converted
*
* @param string $text String to convert, encoded in UTF-8 (no normal form required)
* @return string UTF-8 string where NCRs have been replaced with the actual chars
*/
function utf8_decode_ncr($text)
{
	return preg_replace_callback('/&#([0-9]{1,6}|x[0-9A-F]{1,5});/i', 'utf8_decode_ncr_callback', $text);
}

/**
* Callback used in decode_ncr()
*
* Takes a NCR (in decimal or hexadecimal) and returns a UTF-8 char. Attention, $m is an array.
* It will ignore most of invalid NCRs, but not all!
*
* @param array $m 0-based numerically indexed array passed by preg_replace_callback()
* @return string UTF-8 char
*/
function utf8_decode_ncr_callback($m)
{
	// strncasecmp() returns 0 if the reference starts with an x, i.e. if it is hexadecimal
	$cp = (strncasecmp($m[1], 'x', 1)) ? (int) $m[1] : hexdec(substr($m[1], 1));

	return utf8_chr($cp);
}

/**
* Case folds a UTF-8 string using the mapping tables from includes/utf/data/.
*
* The common mappings are always applied, followed by either the full mappings
* ($option is 'full') or the simple mappings (any other value of $option).
*
* @param string $text text to be case folded
* @param string $option determines how we will fold the cases
* @return string case folded text
*/
function utf8_case_fold($text, $option = 'full')
{
	static $uniarray = array();
	global $phpbb_root_path, $phpEx;

	// common is always set
	if (!isset($uniarray['c']))
	{
		$uniarray['c'] = include($phpbb_root_path . 'includes/utf/data/case_fold_c.' . $phpEx);
	}

	// only set full if we need to
	if ($option === 'full' && !isset($uniarray['f']))
	{
		$uniarray['f'] = include($phpbb_root_path . 'includes/utf/data/case_fold_f.' . $phpEx);
	}

	// only set simple if we need to
	if ($option !== 'full' && !isset($uniarray['s']))
	{
		$uniarray['s'] = include($phpbb_root_path . 'includes/utf/data/case_fold_s.' . $phpEx);
	}

	$text = strtr($text, $uniarray['c']);
	if ($option === 'full')
	{
		$text = strtr($text, $uniarray['f']);
	}
	else
	{
		$text = strtr($text, $uniarray['s']);
	}
	return $text;
}

/**
* A wrapper function for the normalizer which takes care of including the class if required and modifies the passed strings
* to be in NFC (Normalization Form Composition).
*
* The parameter is received by value, so callers which do not pass references have to use the returned value.
*
* @param mixed $strings Either an array of references to strings, a reference to an array of strings or a reference to a single string
* @return mixed The normalized string or array of strings
*/
function utf8_normalize_nfc($strings)
{
	if (!is_array($strings) || (sizeof($strings) > 0))
	{
		if (!class_exists('utf_normalizer'))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
		}

		if (is_array($strings))
		{
			foreach ($strings as $key => $string)
			{
				// assign through the key so that elements which are references get updated
				$strings[$key] = utf_normalizer::nfc($strings[$key]);
			}
		}
		else
		{
			$strings = utf_normalizer::nfc($strings);
		}
	}

	return $strings;
}

/**
* This function is used to generate a "clean" version of a string.
* Clean means that it is a case insensitive form (case folding) and that it is normalized (NFC).
* Additionally a homographs of one character are transformed into one specific character (preferably ASCII
* if it is an ASCII character).
*
* Please be aware that if you change something within this function or within
* functions used here you need to rebuild/update the username_clean column in the users table. And all other
* columns that store a clean string otherwise you will break this functionality.
*
* @param $text An unclean string, maybe user input (has to be valid UTF-8!)
* @return Cleaned up version of the input string
*/
function utf8_clean_string($text)
{
	$text = utf8_case_fold($text);

	if (!class_exists('utf_normalizer'))
	{
		global $phpbb_root_path, $phpEx;
		include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
	}

	$text = utf_normalizer::nfc($text);

	static $homographs = array(
		// cyrillic
		"\xD0\xB0"	=> "\x61",
		"\xD0\xB5"	=> "\x65",
		"\xD0\xBE"	=> "\x6F",
		"\xD1\x80"	=> "\x70",
		"\xD1\x81"	=> "\x63",
		"\xD1\x83"	=> "\x79",
		"\xD1\x85"	=> "\x78",
		"\xD1\x95"	=> "\x73",
		"\xD1\x96"	=> "\x69",
		"\xD1\x98"	=> "\x6A",
		"\xD2\xBB"	=> "\x68",
		// greek
		"\xCE\xB1"	=> "\x61",
		"\xCE\xBF"	=> "\x6F",
		// other
		"\xC2\xA1"	=> "\x69",
	);

	$text = strtr($text, $homographs);

	return $text;
}

?>
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Wed Nov 22 00:35:05 2006 | Cross-referenced by PHPXref 0.6 |