lawrence k wrote:
What PHP code would give me this kind of 100% certainty?
I was bored, so I wrote this. I’m quite proud of myself, as I wrote it, ran it, and it worked the first time! 🙂
It not only checks that the UTF-8 is valid, it forces it to be valid.
<?php
/**
* @copyright Copyright (C) 2007 Toby Inkster
* @license http://www.gnu.org/copyleft/lgpl.html GNU Lesser General Public Licence
*/
/**
 * Utility function that pops the leading byte off a string.
 *
 * The argument is taken by reference and shortened in place: once the
 * call returns, the string no longer contains the byte that was handed
 * back to the caller.
 *
 * @param string $string String to consume from (modified in place).
 * @return string|false The byte that was removed, or FALSE when the
 *                      string was empty (the string is then untouched).
 */
function shift_byte (&$string)
{
    if (strlen($string) >= 1)
    {
        /* Split into "first byte" and "everything after it". */
        $head   = substr($string, 0, 1);
        $string = substr($string, 1);
        return $head;
    }

    /* Nothing left to take. */
    return FALSE;
}
/**
 * Validate a string as UTF-8, and modify the string to remove nasties.
 *
 * Note this function has a side-effect. As well as returning a
 * boolean to indicate whether the given string was valid, it also
 * overwrites the string passed by reference, substituting the
 * replacement string for every ill-formed byte sequence.
 *
 * Well-formedness follows RFC 3629. Besides checking that each lead
 * byte is followed by the right number of continuation bytes
 * (0x80..0xBF), the second byte of a sequence is range-checked so that
 * the following are all rejected:
 *
 *  - overlong forms (lead bytes 0xC0/0xC1, 0xE0 followed by 0x80..0x9F,
 *    0xF0 followed by 0x80..0x8F);
 *  - UTF-16 surrogates U+D800..U+DFFF (0xED followed by 0xA0..0xBF);
 *  - code points above U+10FFFF (0xF4 followed by 0x90..0xBF, and lead
 *    bytes 0xF5..0xFF).
 *
 * Each ill-formed sequence produces exactly one replacement: the
 * offending lead byte and any continuation bytes that still formed a
 * valid prefix are dropped, and scanning resumes at the first byte that
 * did not fit. A valid character that directly follows a broken or
 * truncated sequence is therefore kept rather than swallowed.
 *
 * @param string $string      String to validate; replaced by the
 *                            cleaned-up version on return.
 * @param string $replacement Text substituted for each ill-formed
 *                            sequence. Defaults to a question mark.
 * @return boolean Was the string valid or not? TRUE means nothing had
 *                 to be replaced.
 */
function validate_utf8 (&$string, $replacement = '?')
{
    $bytes  = (string) $string;
    $length = strlen($bytes);
    $new    = '';
    $valid  = TRUE;
    $pos    = 0;

    /* Walk the input by index; chopping bytes off the front with
     * substr() would copy the remainder on every step. */
    while ($pos < $length)
    {
        $lead = ord($bytes[$pos]);

        /* "Seven Z" notation: plain ASCII, always valid. */
        if ($lead <= 0x7F)
        {
            $new .= $bytes[$pos];
            $pos++;
            continue;
        }

        /* Number of continuation bytes required, and the range allowed
         * for the FIRST of them. Later ones are always 0x80..0xBF. */
        $need = 0;
        $low  = 0x80;
        $high = 0xBF;

        /* "Five Y, Six Z" notation (0xC0/0xC1 would be overlong). */
        if ($lead >= 0xC2 && $lead <= 0xDF)
        {
            $need = 1;
        }
        /* "Four X, Six Y, Six Z" notation. */
        elseif ($lead >= 0xE0 && $lead <= 0xEF)
        {
            $need = 2;
            if ($lead === 0xE0)
            {
                $low = 0xA0;   /* Below this is an overlong form. */
            }
            elseif ($lead === 0xED)
            {
                $high = 0x9F;  /* Above this is a UTF-16 surrogate. */
            }
        }
        /* "Three W, Six X, Six Y, Six Z" notation. */
        elseif ($lead >= 0xF0 && $lead <= 0xF4)
        {
            $need = 3;
            if ($lead === 0xF0)
            {
                $low = 0x90;   /* Below this is an overlong form. */
            }
            elseif ($lead === 0xF4)
            {
                $high = 0x8F;  /* Above this exceeds U+10FFFF. */
            }
        }
        /* Anything else (stray continuation byte, 0xC0, 0xC1,
         * 0xF5..0xFF) leaves $need at 0 and is rejected below. */

        /* Count how many continuation bytes are present and acceptable. */
        $matched = 0;
        while ($matched < $need && ($pos + $matched + 1) < $length)
        {
            $next = ord($bytes[$pos + $matched + 1]);
            if ($next < $low || $next > $high)
            {
                break;
            }
            $matched++;
            /* Only the second byte of a sequence has a narrowed range. */
            $low  = 0x80;
            $high = 0xBF;
        }

        if ($need > 0 && $matched === $need)
        {
            /* Complete, well-formed character: copy it verbatim. */
            $new .= substr($bytes, $pos, $need + 1);
            $pos += $need + 1;
        }
        else
        {
            /* Ill-formed: drop the lead byte plus the valid prefix only,
             * so the byte that broke the sequence is examined afresh. */
            $new  .= $replacement;
            $valid = FALSE;
            $pos  += 1 + $matched;
        }
    }

    $string = $new;
    return $valid;
}
?>