PHP
downloads | documentation | faq | getting help | mailing lists | reporting bugs | php.net sites | links | conferences | my php.net

search for in the

xml_error_string> <utf8_decode
Last updated: Sun, 25 Nov 2007

view this page in

utf8_encode

(PHP 4, PHP 5)

utf8_encode — 将 ISO-8859-1 编码的字符串转换为 UTF-8 编码

描述

string utf8_encode ( string $data )

该函数将 data 字符串转换为 UTF-8 编码,并返回编码后的字符串。UTF-8 是一种用于将宽字符值转换为字节流的 Unicode 的标准机制。UTF-8 对于纯 ASCII 字符来说是透明的,且是自同步的(也就是说这使得程序能够得知字符从字节流的何处开始),并可被普通字符串比较函数用于排序等操作。PHP 将 UTF-8 字符编码为最多四个字节,如:

UTF-8 编码
字节(bytes) 位(bits) 表 示
1 7 0bbbbbbb
2 11 110bbbbb 10bbbbbb
3 16 1110bbbb 10bbbbbb 10bbbbbb
4 21 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb

每个 b 表示一个能被用以储存字符数据的位。



xml_error_string> <utf8_decode
Last updated: Sun, 25 Nov 2007
 
add a note add a note User Contributed Notes
utf8_encode
bitseeker
22-Sep-2008 05:07
...or just use this simple piece of code to check valid utf-8 string:

<?php
   
/**
 * Tells whether $string consists solely of well-formed UTF-8 sequences.
 *
 * The pattern is the one published by the W3C
 * (http://w3.org/International/questions/qa-forms-utf-8.html). Besides
 * rejecting overlong forms and surrogates, it only accepts TAB, LF, CR and
 * 0x20-0x7E from the ASCII range, so other control characters make the
 * string "invalid" too.
 *
 * @since  1.14
 * @param  mixed $string string to be tested
 * @return bool  true when the whole string matches; false when it does not
 *               or when PCRE itself fails (e.g. a limit is hit on huge input)
 */
function is_utf8($string) {
    // preg_match() yields 1, 0 or false; compare against 1 so callers
    // really get the boolean the documentation promises.
    return preg_match('%^(?:
              [\x09\x0A\x0D\x20-\x7E]            # ASCII
            | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
            |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
            | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
            |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
            |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
            | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
            |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
        )*$%xs', $string) === 1;
}
?>
hmdker at gmail dot com
24-Aug-2008 12:49
Here's my is_utf8 function, to detect valid UTF-8 text.

<?php
/**
 * Structural UTF-8 check: every lead byte must be followed by the right
 * number of continuation bytes (0x80-0xBF).
 *
 * This is deliberately lenient: it accepts the historical 5- and 6-byte
 * forms and does not reject overlong encodings or surrogates.
 *
 * @param  string $str bytes to inspect
 * @return bool   true when the byte structure is consistent, false otherwise
 */
function is_utf8($str) {
    $len = strlen($str);
    for ($i = 0; $i < $len; $i++) {
        $c = ord($str[$i]);
        // 0x00-0x7F are single-byte characters. The boundary is 128
        // inclusive: 0x80 is a continuation byte and may not start a
        // character (the previous "> 128" test let it through).
        if ($c < 128) {
            continue;
        }
        if ($c >= 254) return false;          // 0xFE/0xFF never occur
        elseif ($c >= 252) $bits = 6;
        elseif ($c >= 248) $bits = 5;
        elseif ($c >= 240) $bits = 4;
        elseif ($c >= 224) $bits = 3;
        elseif ($c >= 192) $bits = 2;
        else return false;                    // stray continuation byte
        // The announced sequence must fit into the remaining input.
        if (($i + $bits) > $len) return false;
        while ($bits > 1) {
            $i++;
            $b = ord($str[$i]);
            if ($b < 128 || $b > 191) return false;
            $bits--;
        }
    }
    return true;
}

?>
akam
30-Jun-2008 10:14
<?php
// Author akam at akameng dot com
// Support 6 bit
/**
 * Converts a UTF-8 byte string into decimal numeric entities ("&#NNN;"),
 * supporting sequences of up to 6 bytes.
 *
 * No validation is performed: the payload of the bytes following a lead
 * byte is taken as (byte - 128) whatever the byte is, and bytes 0x80-0xBF
 * that are met on their own are skipped. A lead byte whose sequence runs
 * past the end of the input is dropped. Bytes 0xFE/0xFF print
 * 'Wrong Result!<br>'.
 *
 * @param  string $input UTF-8 encoded bytes
 * @param  bool   $array false: return one string of entities;
 *                       true: return the list of code points (as strings)
 * @return string|array|null null when $array is neither true nor false
 */
function UTF_to_Unicode($input, $array=False) {
    $value = '';
    $codes = array();
    $len   = strlen($input);

    for ($i = 0; $i < $len; $i++) {
        $lead = ord($input[$i]);

        // Work out how many bytes follow and which bits the lead byte carries.
        if ($lead <= 127) {
            $extra = 0; $code = $lead;
        } elseif ($lead >= 192 && $lead <= 223) {
            $extra = 1; $code = $lead - 192;
        } elseif ($lead >= 224 && $lead <= 239) {
            $extra = 2; $code = $lead - 224;
        } elseif ($lead >= 240 && $lead <= 247) {
            $extra = 3; $code = $lead - 240;
        } elseif ($lead >= 248 && $lead <= 251) {
            $extra = 4; $code = $lead - 248;
        } elseif ($lead == 252 || $lead == 253) {
            // Was "== 252 && == 253", which can never be true.
            $extra = 5; $code = $lead - 252;
        } elseif ($lead >= 254) {
            echo 'Wrong Result!<br>';
            continue;
        } else {
            continue; // 0x80-0xBF: continuation byte, consumed by its lead byte
        }

        // Do not read beyond the end of the string.
        if ($i + $extra >= $len) {
            continue;
        }

        // Each following byte contributes 6 bits (a factor of 64).
        for ($k = 1; $k <= $extra; $k++) {
            $code = $code * 64 + (ord($input[$i + $k]) - 128);
        }

        $value  .= '&#' . $code . ';';
        $codes[] = (string) $code;
    }

    if ($array === False) {
        return $value;
    }
    if ($array === True) {
        return $codes;
    }
}

 
/**
 * Builds a UTF-8 byte string (sequences of up to 6 bytes) from code points.
 *
 * @param  string|array $input either a string of decimal entities
 *                             ("&#269;&#263;") or an array of code points
 * @param  bool         $array unused; kept so existing calls keep working
 * @return string       the encoded bytes; values above 2147483647 are skipped
 */
function Unicode_to_UTF( $input, $array=TRUE){
    if (!is_array($input)) {
        $input = explode(';', str_replace('&#', '', $input));
        // Whatever follows the last ';' is not a code point.
        array_pop($input);
    }

    $utf = '';
    // foreach also copes with arrays whose keys are not 0..n-1.
    foreach ($input as $code) {
        // Integer arithmetic throughout: the 6-byte branch used to hand
        // floats to chr() because its divisions were never cast.
        $code = (int) $code;

        if ($code < 128) {
            $utf .= chr($code);
            continue;
        }

        if     ($code <= 2047)       { $length = 2; $lead = 192; }
        elseif ($code <= 65535)      { $length = 3; $lead = 224; }
        elseif ($code <= 2097151)    { $length = 4; $lead = 240; }
        elseif ($code <= 67108863)   { $length = 5; $lead = 248; }
        elseif ($code <= 2147483647) { $length = 6; $lead = 252; }
        else                         { continue; }

        // Peel off 6 bits per continuation byte, last byte first.
        $tail = '';
        for ($k = 1; $k < $length; $k++) {
            $tail = chr(128 + ($code & 63)) . $tail;
            $code = $code >> 6;
        }
        $utf .= chr($lead + $code) . $tail;
    }

    return $utf;
}
?>
www.tricinty.com
11-Jun-2008 06:43
<?php
   
/**
    * Encodes an ISO-8859-1 mixed variable to UTF-8 (PHP 4, PHP 5 compat)
    * @param    mixed    $input An array, associative or simple
    * @param    boolean  $encode_keys optional
    * @return    mixed     ( utf-8 encoded $input)
    */

   
function utf8_encode_mix($input, $encode_keys=false)
{
    // Anything that is not an array is handed straight to utf8_encode().
    if (!is_array($input)) {
        return utf8_encode($input);
    }

    // Arrays are rebuilt element by element, descending into nested arrays.
    $encoded = array();
    foreach ($input as $name => $item) {
        if ($encode_keys) {
            $name = utf8_encode($name);
        }
        $encoded[$name] = utf8_encode_mix($item, $encode_keys);
    }

    return $encoded;
}
?>
klein at buchung-24 dot de
04-Jun-2008 07:22
If you don't use the function from ethan dot nelson at ltd dot org inside a class, you'll get an error (it calls $this->utf_prepare()), so please try this instead:

/**
 * utf8_encode()s every non-array value of $array in place, descending into
 * nested arrays. Keys are left untouched.
 *
 * @param array $array modified by reference
 */
function utf_prepare(&$array)
{
    foreach ($array as $index => &$item) {
        if (!is_array($item)) {
            $item = utf8_encode($item);
            continue;
        }
        utf_prepare($item);
    }
    unset($item);
}
www.qaiser.net
17-Apr-2008 10:56
that isUTF8 function is a killer...

wouldn't something like

// Quick heuristic only. The group is now closed (the pattern did not compile) and the
// pattern is single-quoted so PCRE, not PHP, interprets \x00 (no NUL bytes in the pattern).
if ( preg_match( '~(\x00[\x80-\xff]|[\x00-\x07][\x00-\xff])~', $string ) ) { /* is utf */ };

be a lot more efficient? it doesn't take into account all the ranges, but it has to be a better method and a simple start since it'll quit on the first successful match. think of encoding and decoding a 1mb string--not good. i'm having to work with +20 meg xml files.
renardo13 at free dot fr
01-Apr-2008 08:56
another nice way to implement an isUTF8 function ...

<?php

/**
 * Round-trip test: decode $string from UTF-8 to ISO-8859-1, encode it back
 * and see whether the same bytes come out.
 *
 * Note that utf8_decode() replaces characters that do not exist in
 * ISO-8859-1 with '?', so valid UTF-8 containing such characters is
 * reported as false.
 */
function isUTF8($string)
{
    $round_trip = utf8_encode(utf8_decode($string));

    return $round_trip == $string;
}

?>
tacchete at gmail dot com
13-Dec-2007 08:35
Known problem with Byte Order Mark (BOM) and header() in pages of a site.

For example, it shows up when sending headers, or when producing dynamic output in an encoding other than UTF-8 by means of XSLT (<xsl:output encoding="windows-1251"/>).

To strip every BOM from the text of the page:

1. remove the BOM from the main file;
2. write an output-buffer callback function:

<?php
// Announce UTF-8 output. Content-Type parameters use "name=value", so the
// charset has to be written "charset=utf-8" ("charset: utf-8" is ignored).
header('content-type: text/html; charset=utf-8');

// Pass everything printed from here on through ob() before it is sent.
ob_start('ob');

/**
 * Output-buffer callback that removes every UTF-8 byte order mark
 * (EF BB BF) from the buffered output.
 *
 * @param  string $buffer buffered output
 * @return string the output without BOM sequences
 */
function ob($buffer)
{
    return str_replace("\xef\xbb\xbf", '', $buffer);
}
?>

this strips the BOM from the output of any included files;
3. stop worrying about BOMs in included files;
4. enjoy.
ethan dot nelson at ltd dot org
07-Nov-2007 09:41
This does the same thing as some of the posts below (minus the keys), but I thought I'd share anyway because it is slightly more elegant.  Also, it's a good example of using references, such that this could be used as a callback function.

  function utf_prepare(&$array) {

    foreach($array AS $key => &$value) {

      if (is_array($value)) {
        $this->utf_prepare($value);
      } else {
        $value = utf8_encode($value);
      }

    }

  }
luka8088 at gmail dot com
22-Jun-2007 10:19
simple HTML to UTF-8 conversion:

/**
 * Replaces decimal numeric entities ("&#269;", 3 to 10 digits) in $data
 * with the UTF-8 bytes of the character they denote.
 *
 * Uses preg_replace_callback(): the /e modifier evaluated the replacement
 * as PHP code and no longer exists in PHP 7+.
 *
 * @param  string $data text containing numeric entities
 * @return string text with the entities expanded
 */
function html_to_utf8 ($data)
    {
    return preg_replace_callback('/&#([0-9]{3,10});/', '_html_to_utf8_match', $data);
    }

/** Callback for html_to_utf8(): forwards the captured digits to the encoder. */
function _html_to_utf8_match ($match)
    {
    return _html_to_utf8($match[1]);
    }

/**
 * Encodes one code point as UTF-8.
 *
 * Code points up to 127 are returned as the entity "&#N;" (as before), and
 * so are values that are not Unicode scalar values (surrogates, anything
 * above U+10FFFF).
 *
 * @param  int|string $data decimal code point
 * @return string     UTF-8 bytes, or the unchanged entity
 */
function _html_to_utf8 ($data)
    {
    if ($data <= 127 || $data > 0x10FFFF)
        return "&#$data;";

    $code = (int) $data;
    if ($code >= 0xD800 && $code <= 0xDFFF)
        return "&#$data;";

    // The length is chosen from explicit ranges; deriving it from powers of
    // 64 picked too short a sequence for 2048-4095 and 65536-262143.
    if ($code < 0x800)
        return chr(0xC0 | ($code >> 6))
             . chr(0x80 | ($code & 0x3F));
    if ($code < 0x10000)
        return chr(0xE0 | ($code >> 12))
             . chr(0x80 | (($code >> 6) & 0x3F))
             . chr(0x80 | ($code & 0x3F));
    return chr(0xF0 | ($code >> 18))
         . chr(0x80 | (($code >> 12) & 0x3F))
         . chr(0x80 | (($code >> 6) & 0x3F))
         . chr(0x80 | ($code & 0x3F));
    }

Example:
echo html_to_utf8("a b &#269; &#263; &#382; &#12371; &#12395; &#12385; &#12431; ()[]{}!#$?* &lt; &#62;");

Output:
a b č ć ž こ に ち わ ()[]{}!#$?* &lt; &#62;
hillar dot petersen at gmail dot com
30-May-2007 01:59
In addition to my previous post. If your values are already in utf-8 maybe you want to utf8_encode array keys only. This will do it:

<?php
/**
 * Returns a copy of $array in which the keys of every associative ("map")
 * array, at any depth, have been passed through utf8_encode(). Keys of
 * list-like ("vector") arrays and all values are copied unchanged.
 *
 * @param array $array
 * @return array|false the re-keyed copy, or false when $array is not an array
 */
function utf8_encode_array_keys($array)
{
    $kind = array_type($array);

    // array_type() answers false for non-arrays.
    if ($kind != "map" && $kind != "vector") {
        return false;
    }

    $copy = array();
    foreach ($array as $name => $item) {
        // Only maps get their keys encoded.
        $target = ($kind == "map") ? utf8_encode($name) : $name;
        // Nested arrays are processed the same way; scalars are copied.
        $copy[$target] = is_array($item) ? utf8_encode_array_keys($item) : $item;
    }

    return $copy;
}
?>

Also note that both this operation (with keys only) and the operation with both keys and values can be reversed by replacing "encode" by "decode".
hillar dot petersen at gmail dot com
29-May-2007 10:06
If you are interested in recursively converting ISO-8859-1-encoded arrays into UTF-8, then this is one way to do it. Could use a small refactor though. (I used it to prepare some ISO-8859-1 arrays for json_encode. Note that for this to work your values and for associative arrays also your keys must be ISO-8859-1-encoded.)

<?php
/**
 * (Recursively) utf8_encode each string value in an array. For associative
 * ("map") arrays the keys are encoded as well; non-string scalars are
 * copied unchanged.
 *
 * @param array $array
 * @return array|false the encoded copy, or false when $array is not an array
 */
function utf8_encode_array($array)
{
  if (!is_array($array))
  {
    return false;     // argument is not an array, return false
  }

  // The array type does not change while we iterate, so determine it once
  // instead of re-scanning the whole array for every element.
  $encode_keys = (array_type($array) == "map");

  $result_array = array();

  foreach ($array as $key => $value)
  {
    $new_key = $encode_keys ? utf8_encode($key) : $key;

    if (is_array($value))
    {
      // recursion
      $result_array[$new_key] = utf8_encode_array($value);
    }
    else if (is_string($value))
    {
      $result_array[$new_key] = utf8_encode($value);
    }
    else
    {
      // do not re-encode non-strings, just copy data
      $result_array[$new_key] = $value;
    }
  }

  return $result_array;
}

/**
 * Determines array type ("vector" or "map"). An array is a vector when its
 * keys are exactly 0, 1, 2, ... in iteration order (an empty array counts
 * as a vector); anything else is a map.
 *
 * @param array $array The array to analyze
 * @return string|false "vector" or "map", or false if not an array
 */
function array_type($array)
{
  if (!is_array($array))
  {
    return false;    // not array
  }

  $next = 0;

  foreach ($array as $key => $value)
  {
    // Strict comparison: with "!=" a string key such as "a" compares equal
    // to 0 on PHP < 8, so array('a' => 1) was reported as a vector.
    if ($key !== $next)
    {
      return "map";
    }

    $next++;
  }

  return "vector";
}
?>
nikooo adog bk adot ru - Nickolaz
03-May-2007 10:02
You can use this simple code to convert win-1251 into Unicode.

    /**
     * Translates between single characters and hexadecimal entities of the
     * form "&#x4NN;" using strtr().
     *
     * NOTE(review): the literals below only line up with chr(192..255) if
     * this file and $str are windows-1251 encoded — confirm before use.
     *
     * @param string $str   text to convert
     * @param bool   $isTo  true: characters to entities; false: the flipped table
     * @return string
     */
    function rus2uni($str,$isTo = true)
    {
        // Two letters are listed explicitly; the rest is generated below.
        $arr = array('ё'=>'&#x451;','Ё'=>'&#x401;');
        // Bytes 192..255 map to &#x410; .. &#x44f; (dechex(16) .. dechex(79)).
        for($i=192;$i<256;$i++)
            $arr[chr($i)] = '&#x4'.dechex($i-176).';';
        // Replace a space that directly follows or precedes a character of
        // the [а-я] class with the entity &#x0a0;. This runs in both directions.
        $str =preg_replace(array('@([а-я]) @i','@ ([а-я])@i'),array('$1&#x0a0;','&#x0a0;$1'),$str);
        return strtr($str,$isTo?$arr:array_flip($arr));
    }

That is useful for xml_parser (to parse windows-1251 files like utf-8).
19-Apr-2007 12:06
I just read what I wrote, sorry for the typos it was a long day:

here's the rewritten code:

xml_tpl.php
<?php
    // Template prologue: sends a Content-Type header, prints the XML
    // declaration and defines $names for the markup that follows.
    // NOTE(review): the header announces text/html in ISO-8859-1 while the
    // XML declaration says UTF-8 — presumably the caller converts the
    // captured output with utf8_encode(); confirm.
    header
("Content-Type: text/html;charset=ISO-8859-1");
    print
"<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n";
   
$names=array('jack','bob','vanessa','catherine','valerie');
?>
<parent>
<?php foreach($names as $name) {?>
    <child name="<?php print $name?>" />
<?php } ?>
</parent>

<?php
/**
 * Renders xml_tpl.php into an output buffer and writes the result,
 * converted with utf8_encode(), to ./somefile.xml.
 *
 * Nothing is written when the file cannot be opened.
 */
function create_xml(){
    // Capture what the template prints instead of sending it to the client.
    ob_start();
    include "xml_tpl.php";
    $trapped_content = ob_get_clean();

    $file_path   = "./somefile.xml";
    $file_handle = fopen($file_path, 'w');
    if ($file_handle === false) {
        return;
    }

    fwrite($file_handle, utf8_encode($trapped_content));
    // Release the handle; it used to stay open until the script ended.
    fclose($file_handle);
}

?>
penda ekoka
18-Apr-2007 02:15
creating utf-8 xml files:
this is something that has wasted a lot of my time, I hope this will spare you the headaches:

my method consists of creating an xml template that will look like this (this is probably optional, I'm sure you can use good ol' print or echo statements):

xml_tpl.php
<?php
// Template: prints the XML declaration and one <child> element per name.
// The output is meant to be captured in an output buffer by the caller.
header("Content-Type: text/html;charset=ISO-8859-1");
print "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n";
$names=array('jack','bob','vanessa','catherine','valerie');
?>
<parent>
<?php foreach($names as $name) {?>
    <child name="<?php print $name?>" />
<?php } ?>
</parent>

From a function or a method I include the previous template and trap the outputted content in an output buffer. The buffered content is then written to a file:

<?php
/**
 * Renders the xml_tpl.php template into an output buffer and writes the
 * result, converted with utf8_encode(), to ./somefile.xml.
 *
 * Nothing is written when the file cannot be opened.
 */
function create_xml(){
    // Capture what the template prints instead of sending it to the client.
    ob_start();
    include "xml_tpl.php";   // the template shown above (was "xml_php.php")
    $trapped_content = ob_get_clean();

    $file_path   = "./somefile.xml";
    // Open the path defined above; $somefile was never set.
    $file_handle = fopen($file_path, 'w');
    if ($file_handle === false) {
        return;
    }

    fwrite($file_handle, utf8_encode($trapped_content));
    fclose($file_handle);
}

?>

Some side notes:
- note that the utf8_encode function goes inside the fwrite() function.
- when troubleshooting, make sure to transfer text files (xml included) and scripts in ascii mode when using ftp. For some unknown reason my ftp client did not have xml set as an ascii transfer candidate and was automatically transferring them in binary. That little "feature" ended up costing me hours of frustration, as the encoding information would just "vanish" between transfers and I kept scratching my head as to why manually created utf8 files were not behaving as they should.
29-Mar-2007 06:07
<?php

/**
 * Converts Russian letters to hexadecimal numeric entities, or back.
 *
 * The letters are matched as they are stored in this source file, so $str
 * must use the same encoding as the file.
 *
 * @param  string $str    text to convert
 * @param  bool   $to_uni true: letters to entities; false: entities to letters
 * @return string
 */
function unicon($str, $to_uni = true) {
    $cp = Array (
        "А" => "&#x410;", "а" => "&#x430;",
        "Б" => "&#x411;", "б" => "&#x431;",
        "В" => "&#x412;", "в" => "&#x432;",
        "Г" => "&#x413;", "г" => "&#x433;",
        "Д" => "&#x414;", "д" => "&#x434;",
        "Е" => "&#x415;", "е" => "&#x435;",
        "Ё" => "&#x401;", "ё" => "&#x451;",
        "Ж" => "&#x416;", "ж" => "&#x436;",
        "З" => "&#x417;", "з" => "&#x437;",
        "И" => "&#x418;", "и" => "&#x438;",
        "Й" => "&#x419;", "й" => "&#x439;",
        "К" => "&#x41A;", "к" => "&#x43A;",
        "Л" => "&#x41B;", "л" => "&#x43B;",
        "М" => "&#x41C;", "м" => "&#x43C;",
        "Н" => "&#x41D;", "н" => "&#x43D;",
        "О" => "&#x41E;", "о" => "&#x43E;",
        "П" => "&#x41F;", "п" => "&#x43F;",
        "Р" => "&#x420;", "р" => "&#x440;",
        "С" => "&#x421;", "с" => "&#x441;",
        "Т" => "&#x422;", "т" => "&#x442;",
        "У" => "&#x423;", "у" => "&#x443;",
        "Ф" => "&#x424;", "ф" => "&#x444;",
        "Х" => "&#x425;", "х" => "&#x445;",
        "Ц" => "&#x426;", "ц" => "&#x446;",
        "Ч" => "&#x427;", "ч" => "&#x447;",
        "Ш" => "&#x428;", "ш" => "&#x448;",
        "Щ" => "&#x429;", "щ" => "&#x449;",
        "Ъ" => "&#x42A;", "ъ" => "&#x44A;",
        "Ы" => "&#x42B;", "ы" => "&#x44B;",
        "Ь" => "&#x42C;", "ь" => "&#x44C;",
        "Э" => "&#x42D;", "э" => "&#x44D;",
        "Ю" => "&#x42E;", "ю" => "&#x44E;",
        "Я" => "&#x42F;", "я" => "&#x44F;"
    );

    // Every entity occurs once, so flipping the table gives the reverse
    // mapping directly (no per-entry array_search() needed).
    $table = $to_uni ? $cp : array_flip($cp);

    return strtr($str, $table);
}

?>
emze at donazga dot net
18-Dec-2006 01:42
/*
Every function seen so far is incomplete or resource-consuming. Here are two -- integer to UTF-8 sequence (i3u) and UTF-8 sequence to integer (u3i). Below is a code snippet that checks for correct behavior at the range boundaries.

Someday they might be hardcoded into PHP...
*/

/**
 * Encodes an integer code point as a UTF-8 sequence of 1 to 6 bytes.
 *
 * @param  int $i code point
 * @return string|false the bytes; false for negative input; '?' when the
 *                      value needs more than 31 bits
 */
function i3u($i) {
  $code = (int)$i;
  if ($code < 0) return false;
  if ($code <= 0x7f) return chr($code);
  if (($code & 0x7fffffff) != $code) return '?';

  // Marker bits of the lead byte and number of continuation bytes.
  if     ($code <= 0x7ff)     { $lead = 0xc0; $tail = 1; }
  elseif ($code <= 0xffff)    { $lead = 0xe0; $tail = 2; }
  elseif ($code <= 0x1fffff)  { $lead = 0xf0; $tail = 3; }
  elseif ($code <= 0x3ffffff) { $lead = 0xf8; $tail = 4; }
  else                        { $lead = 0xfc; $tail = 5; }

  // Continuation bytes carry 6 bits each, lowest bits in the last byte.
  $rest = '';
  for ($k = 0; $k < $tail; $k++) {
    $rest = chr(0x80 | (($code >> (6 * $k)) & 0x3f)) . $rest;
  }

  // Whatever is left above the continuation bits goes into the lead byte.
  return chr($lead | ($code >> (6 * $tail))) . $rest;
}

/**
 * Decodes one UTF-8 sequence (1 to 6 bytes) into its integer code point.
 *
 * Strict mode checks that every byte after the first is a continuation
 * byte (0x80-0xBF) and that the string is exactly as long as the lead byte
 * announces. Non-strict mode only collects the data bits: missing bytes
 * count as zero and surplus bytes are ignored.
 *
 * @param  string   $s      the byte sequence
 * @param  bool|int $strict validate length and continuation bytes
 * @return int|null|false   code point; NULL for empty input; false when invalid
 */
function u3i($s,$strict=1) {
  if ($s=='') return NULL;
  $l=strlen($s); $o=ord($s[0]);
  if ($o <= 0x7f && $l==1) return $o;
  if ($l>6 && $strict) return false;
  if ($strict) for ($i=1;$i<$l;$i++) if (ord($s[$i]) > 0xbf || ord($s[$i])< 0x80) return false;
  if ($o < 0xc2) return false; // no-go even if strict=0

  // Sequence length and data-bit mask announced by the lead byte.
  if     ($o <= 0xdf) { $n = 2; $mask = 0x1f; }
  elseif ($o <= 0xef) { $n = 3; $mask = 0x0f; }
  elseif ($o <= 0xf7) { $n = 4; $mask = 0x07; }
  elseif ($o <= 0xfb) { $n = 5; $mask = 0x03; }
  elseif ($o <= 0xfd) { $n = 6; $mask = 0x01; }
  else return false;

  // The old tests read "$l=2 && $strict": an assignment, so the length was
  // never compared and non-strict calls always fell through to false.
  if ($strict && $l != $n) return false;

  $code = $o & $mask;
  for ($k = 1; $k < $n; $k++) {
    $byte = ($k < $l) ? ord($s[$k]) : 0;
    $code = ($code << 6) | ($byte & 0x3f);
  }
  return $code;
}

// Boundary behavior checking: for the largest value of each sequence length
// and the value right after it, print the bits of every byte produced by
// i3u() and the result of decoding them again with u3i().
$do=array(0x7f,0x7ff,0xffff,0x1fffff,0x3ffffff,0x7fffffff);
foreach ($do as $ii) for ($i=$ii;$i<=$ii+1; $i++) {
  $o=i3u($i);
  // Square brackets: the $o{$j} offset syntax no longer exists in PHP 8.
  for ($j=0;$j<strlen($o);$j++) print "O[$j]=" . sprintf('%08b',ord($o[$j])) . ", ";
  print "c=$i, o=[$o].\n";
  print "Back: [$o] => [" . u3i($o) . "]\n";
}
sadikkeskin at hotmail dot com
21-Nov-2006 06:49
i wrote a function to convert encoding utf8 to iso-8859-9. This function is very useful if you want to use this for ajax.
you can apply same way for other languages.
<?
/**
 * Replaces the UTF-8 byte pairs of the twelve Turkish letters
 * ı İ ö Ö ç Ç ş Ş ğ Ğ ü Ü with their single-byte ISO-8859-9 values.
 * For any other $to/$from combination the string is returned as is.
 */
function str_encode ($string,$to="iso-8859-9",$from="utf8") {
    // Only the utf8 -> iso-8859-9 direction is implemented.
    if (!($to=="iso-8859-9" && $from=="utf8")) {
        return $string;
    }

    // Parallel lists: $utf8_pairs[n] is replaced by $latin5_bytes[n].
    $utf8_pairs = array(
        "\xC4\xB1", "\xC4\xB0", "\xC3\xB6", "\xC3\x96",
        "\xC3\xA7", "\xC3\x87", "\xC5\x9F", "\xC5\x9E",
        "\xC4\x9F", "\xC4\x9E", "\xC3\xBC", "\xC3\x9C"
    );
    $latin5_bytes = array(
        "\xFD", "\xDD", "\xF6", "\xD6",
        "\xE7", "\xC7", "\xFE", "\xDE",
        "\xF0", "\xD0", "\xFC", "\xDC"
    );

    return str_replace($utf8_pairs, $latin5_bytes, $string);
}
?>
genert at adsuk dot com
02-Oct-2006 01:23
If you encoded data with utf8_encode function and you would like to decode it in javascript use library found here: http://www.webtoolkit.info/. There is encoder too.
28-Sep-2006 04:30
In reply to Cundle:

Note: The BOM is completely unnecessary in UTF-8. UTF-8 is interpreted the same way regardless of endianness, e.g. Λ (U+039B, GREEK CAPITAL LETTER LAMDA) is represented as the octets 0xCE, 0x9B, always in that order.

Extra note: UTF-16 and UCS-2 are different. The same letter would be encoded as 0x03 0x9B on big-endian (e.g. Motorola) architecture, but 0x9B 0x03 on little-endian (e.g Intel) architecture.

But in any case, there's nothing wrong with putting a BOM at the beginning of a UTF-8 encoded file. It is just treated as U+FEFF Zero Width No-Break Space.
James Cundle
18-Jul-2006 10:33
I had some difficulty finding a way to easily write UTF-8 files with the byte order mark included. This is the simple solution I have come up with:

<?php
/**
 * Writes $content to $filename (truncating it), preceded by the UTF-8
 * byte order mark EF BB BF.
 */
function writeUTF8File($filename,$content) {
    $bom = "\xef\xbb\xbf";

    $out = fopen($filename, "w");
    fwrite($out, $bom);        // byte order mark first
    fwrite($out, $content);    // then the payload, unchanged
    fclose($out);
}
?>

When you read the file back in using fopen, the BOM will also be there. To remove it, I also wrote the following function:

<?php
/**
 * Returns $str without a leading UTF-8 byte order mark (EF BB BF);
 * strings that do not start with one are returned unchanged.
 */
function removeBOM($str=""){
    $bom = "\xef\xbb\xbf";

    return (substr($str, 0, 3) == $bom) ? substr($str, 3) : $str;
}
?>
rocketman
16-Mar-2006 08:46
If you are looking for a function to replace special characters with the hex-utf-8 value (e.g. for Webservice-Security/WSS4J compliance) you might use this:

// Replaces "&", control characters and non-ASCII bytes of an ISO-8859-1
// string with hexadecimal entities, one entity per UTF-8 byte.

// "Größe" written as ISO-8859-1 bytes, so the example gives the documented
// result whatever encoding this file is saved in.
$textstart = "Gr\xF6\xDFe";
$txt = $textstart;      // the loop below works on $txt, which was never set
$utf8 = '';
$max = strlen($txt);

for ($i = 0; $i < $max; $i++) {
    $char = $txt[$i];   // was $txt{i}: constant "i" instead of $i

    if ($char == "&") {
        $neu = "&#x26;";    // numeric entities need the "#"
    }
    elseif ((ord($char) < 32) or (ord($char) > 127)) {
        // urlencode() turns each UTF-8 byte into %XX; rewrite every %XX as &#xXX;
        $neu = preg_replace('#%(..)#', '&#x\1;', urlencode(utf8_encode($char)));
    }
    else {
        $neu = $char;
    }

    $utf8 .= $neu;
} // for $i

$textnew = $utf8;

In this example $textnew will be "Gr&#xC3;&#xB6;&#xC3;&#x9F;e"
mailing at jcn50 dot com
21-Jan-2006 02:40
I recommend using this alternative for every language:

$new=mb_convert_encoding($s,"UTF-8","auto");

Don't forget to set all your pages to "utf-8" encoding, otherwise just use HTML entities.

jcn50.
migueldiaz at gennio dot com
14-Dec-2005 01:23
Here's my function to know if one string is encoded in UTF8.

If we encode in UTF8 a string or text file that is already encoded in UTF8, we can expect to find the character 'ƒ' (ALT+159) in the final string.

<?php

/**
 * Heuristic: runs utf8_encode() on $string and reports true when the result
 * contains the marker byte 0x83.
 *
 * utf8_encode() turns an input byte 0xC3 into C3 83, so UTF-8 text whose
 * characters start with 0xC3 (U+00C0-U+00FF) is detected; other UTF-8 text
 * is not, and ISO-8859-1 input containing the byte 0xC3 or 0x83 is
 * misreported as UTF-8.
 *
 * @param  string $string
 * @return bool
 */
function isUTF8($string)
{
    $string_utf8 = utf8_encode($string);

    // NOTE(review): the marker character was lost from the posted code (the
    // needle was empty). ALT+159 is "ƒ", byte 0x83 in windows-1252, which is
    // presumably what the author typed — confirm.
    if (strpos($string_utf8, "\x83", 0) !== false)
        return true;    // the original string was utf8
    else
        return false;   // otherwise
}

?>

regards
Miguel Díaz
05-Nov-2005 06:34
// Reads a file story.txt ascii (as typed on keyboard)
// converts it to Georgian character using utf8 encoding
// if I am correct(?) just as it should be when typed on Georgian computer
// it outputs it as an html file
//
// http://www.comweb.nl/keys_to_georgian.html
// http://www.comweb.nl/keys_to_georgian.php
// http://www.comweb.nl/story.txt

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">

<HTML>
<HEAD>
<TITLE>keys to unicode code</TITLE>

// this meta tag is needed
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" >

// note the sylfean font seems to be standard installed on Windows XP
// It supports Georgian
 
<style TYPE="text/css">
<!--
body {font-family:sylfaen; }
-->
</style>
</HEAD>

<BODY>

<?
$eng
=array(97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
112,113,114,115,116,117,118,119,120,121,122,87,82,84,83,
67,74,90);
$geo=array(4304,4305,4330,4307,4308,4324,4306,4336,4312,4335,4313,
4314,4315,4316,4317,4318,4325,4320,4321,4322,4323,4309,
4332,4334,4327,4310,4333,4326,4311,4328,4329,4319,4331,
91,93,59,39,44,46,96);

$fc=file("story.txt");
foreach(
$fc as $line)
{
  
$spacestart=1;
   for (
$i=0; $i<strlen($line); $i+=1)
   {
     
$character=ord(substr($line,$i,1));
     
$found=0;
      for (
$k=0; $k<count($eng); $k+=1)
      {
         if (
$eng[$k]==$character)
         {
             print
code2utf( $geo[$k] );
            
$found=1;
         }
      }
      if (
$found==0)
      {
         if (
$character==126 || $character==32 || $character==10 || $character==9)
         {
            if (
$character==9)  { print '&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;'; }
            if (
$character