sforkowany z mirror/friendica
969 wiersze
38 KiB
PHP
969 wiersze
38 KiB
PHP
<?php
|
|
// {{{ license
|
|
|
|
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
|
|
//
|
|
// +----------------------------------------------------------------------+
|
|
// | This library is free software; you can redistribute it and/or modify |
|
|
// | it under the terms of the GNU Lesser General Public License as |
|
|
// | published by the Free Software Foundation; either version 2.1 of the |
|
|
// | License, or (at your option) any later version. |
|
|
// | |
|
|
// | This library is distributed in the hope that it will be useful, but |
|
|
// | WITHOUT ANY WARRANTY; without even the implied warranty of |
|
|
// | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
|
// | Lesser General Public License for more details. |
|
|
// | |
|
|
// | You should have received a copy of the GNU Lesser General Public |
|
|
// | License along with this library; if not, write to the Free Software |
|
|
// | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 |
|
|
// | USA. |
|
|
// +----------------------------------------------------------------------+
|
|
//
|
|
|
|
// }}}
|
|
|
|
/**
|
|
* Encode/decode Internationalized Domain Names.
|
|
*
|
|
* The class allows to convert internationalized domain names
|
|
* (see RFC 3490 for details) as they can be used with various registries worldwide
|
|
* to be translated between their original (localized) form and their encoded form
|
|
* as it will be used in the DNS (Domain Name System).
|
|
*
|
|
* The class provides two public methods, encode() and decode(), which do exactly
|
|
* what you would expect them to do. You are allowed to use complete domain names,
|
|
* simple strings and complete email addresses as well. That means, that you might
|
|
* use any of the following notations:
|
|
*
|
|
* - www.nörgler.com
|
|
* - xn--nrgler-wxa
|
|
* - xn--brse-5qa.xn--knrz-1ra.info
|
|
*
|
|
* Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4
|
|
* array. Unicode output is available in the same formats.
|
|
* You can select your preferred format via {@link set_parameter()}.
|
|
*
|
|
* ACE input and output is always expected to be ASCII.
|
|
*
|
|
* @author Matthias Sommerfeld <mso@phlylabs.de>
|
|
* @copyright 2004-2007 phlyLabs Berlin, http://phlylabs.de
|
|
* @version 0.5.1
|
|
*
|
|
*/
|
|
class idna_convert
|
|
{
|
|
/**
 * Holds all relevant mapping tables, loaded from a separate file on construct.
 * See RFC3454 for details.
 *
 * @var array
 * @access private
 */
var $NP = array();

// Internal settings, do not mess with them

// Prefix that marks an ACE (Punycode) label
var $_punycode_prefix = 'xn--';
var $_invalid_ucs = 0x80000000;
// Highest code point the encoder searches for
var $_max_ucs = 0x10FFFF;

// Punycode parameters (values match RFC 3492, section 5)
var $_base = 36;
var $_tmin = 1;
var $_tmax = 26;
var $_skew = 38;
var $_damp = 700;
var $_initial_bias = 72;
var $_initial_n = 0x80;

// Hangul syllable (de)composition constants
var $_sbase = 0xAC00;
var $_lbase = 0x1100;
var $_vbase = 0x1161;
var $_tbase = 0x11A7;
var $_lcount = 19;
var $_vcount = 21;
var $_tcount = 28;
var $_ncount = 588; // _vcount * _tcount
var $_scount = 11172; // _lcount * _tcount * _vcount

// Assigned by the constructor: _sbase + _lcount * _vcount * _tcount.
// Declared here so the assignment does not create a dynamic property.
var $slast = null;

// Last error message, false while no error has been recorded
var $_error = false;

// See {@link set_parameter()} for details of how to change the following
// settings from within your script / application
var $_api_encoding = 'utf8'; // Default input charset is UTF-8
var $_allow_overlong = false; // Overlong UTF-8 encodings are forbidden
var $_strict_mode = false; // Behave strict or not
|
|
|
|
/**
 * Constructor.
 *
 * PHP 8 no longer treats a method named after its class as a constructor,
 * so the real work stays in {@link idna_convert()} and this method simply
 * delegates to it.
 *
 * @param    mixed   Optional array of Parameter => Value pairs, see {@link set_parameter()}
 * @access   public
 */
function __construct($options = false)
{
    $this->idna_convert($options);
}

/**
 * PHP 4 style constructor, kept so that existing code calling it explicitly
 * (e.g. from a subclass) keeps working.
 *
 * Loads the serialized mapping tables from npdata.ser (expected next to this
 * file) and applies the given options, if any.
 *
 * @param    mixed   Optional array of Parameter => Value pairs, see {@link set_parameter()}
 * @return   boolean Result of set_parameter() if options were given, else true
 * @access   public
 */
function idna_convert($options = false)
{
    $this->slast = $this->_sbase + $this->_lcount * $this->_vcount * $this->_tcount;

    $data_file = dirname(__FILE__).'/npdata.ser';
    if (function_exists('file_get_contents')) {
        $raw = file_get_contents($data_file);
    } else {
        $raw = join('', file($data_file));
    }
    $tables = unserialize($raw);
    if (is_array($tables)) {
        $this->NP = $tables;
    } else {
        // Keep NP an array and leave a trace instead of silently storing false
        $this->NP = array();
        $this->_error('Could not load mapping tables from '.$data_file);
    }

    // If parameters are given, pass these to the respective method
    if (is_array($options)) {
        return $this->set_parameter($options);
    }
    return true;
}
|
|
|
|
/**
 * Change one or several settings of the converter.
 *
 * Known options:
 *  - encoding: 'utf8', 'ucs4_string' or 'ucs4_array'
 *  - overlong: truthy to accept overlong UTF-8 encodings, falsy to reject them
 *  - strict:   truthy for strict mode, falsy for loose mode
 *
 * Processing stops at the first unknown option or unknown encoding value;
 * options handled before that point stay applied.
 *
 * @param    mixed     Option name (string) or array of Option => Value pairs
 * @param    mixed     Value to use when the first argument is a single option name
 * @return   boolean   true on success, false otherwise (see {@link get_last_error()})
 * @access   public
 */
function set_parameter($option, $value = false)
{
    $settings = is_array($option) ? $option : array($option => $value);

    foreach ($settings as $name => $setting) {
        // Loose comparisons on purpose: they mirror switch/case matching
        if ($name == 'encoding') {
            if ($setting == 'utf8' || $setting == 'ucs4_string' || $setting == 'ucs4_array') {
                $this->_api_encoding = $setting;
            } else {
                $this->_error('Set Parameter: Unknown parameter '.$setting.' for option '.$name);
                return false;
            }
        } elseif ($name == 'overlong') {
            $this->_allow_overlong = (bool) $setting;
        } elseif ($name == 'strict') {
            $this->_strict_mode = (bool) $setting;
        } else {
            $this->_error('Set Parameter: Unknown option '.$name);
            return false;
        }
    }
    return true;
}
|
|
|
|
/**
 * Decode a given ACE domain name.
 *
 * The input may be a single label, a dotted domain name, an email address
 * or a URL. Labels that cannot be decoded are passed through unchanged.
 * In strict mode only single labels are accepted.
 *
 * @param    string   Domain name (ACE string)
 * @param    string   Optional output encoding for this call only
 *                    ('utf8', 'ucs4_string' or 'ucs4_array'), see {@link set_parameter()}
 * @return   mixed    Decoded name (UTF-8 string, UCS-4 string or UCS-4 array),
 *                    false on error
 * @access   public
 */
function decode($input, $one_time_encoding = false)
{
    // Optionally set
    if ($one_time_encoding) {
        switch ($one_time_encoding) {
            case 'utf8':
            case 'ucs4_string':
            case 'ucs4_array':
                break;
            default:
                $this->_error('Unknown encoding '.$one_time_encoding);
                return false;
        }
    }
    // Make sure to drop any newline characters around
    $input = trim($input);

    // Negotiate input and try to determine, whether it is a plain string,
    // an email address or something like a complete URL
    if (strpos($input, '@') !== false) { // Maybe it is an email address
        // "!== false" so that an "@" at offset 0 is recognised as well
        if ($this->_strict_mode) {
            $this->_error('Only simple domain name parts can be handled in strict mode');
            return false;
        }
        list($email_pref, $domain) = explode('@', $input, 2);
        $domain = $this->_decode_labels($domain, true);
        $email_pref = $this->_decode_labels($email_pref, true);
        $return = $email_pref . '@' . $domain;
    } elseif (preg_match('![:\./]!', $input)) { // Or a complete domain name (with or without paths / parameters)
        if ($this->_strict_mode) {
            $this->_error('Only simple domain name parts can be handled in strict mode');
            return false;
        }
        $parsed = parse_url($input);
        if (isset($parsed['host'])) {
            $parsed['host'] = $this->_decode_labels($parsed['host'], false);
            $return =
                    (empty($parsed['scheme']) ? '' : $parsed['scheme'].(strtolower($parsed['scheme']) == 'mailto' ? ':' : '://'))
                    .(empty($parsed['user']) ? '' : $parsed['user'].(empty($parsed['pass']) ? '' : ':'.$parsed['pass']).'@')
                    .$parsed['host']
                    .(empty($parsed['port']) ? '' : ':'.$parsed['port'])
                    .(empty($parsed['path']) ? '' : $parsed['path'])
                    .(empty($parsed['query']) ? '' : '?'.$parsed['query'])
                    .(empty($parsed['fragment']) ? '' : '#'.$parsed['fragment']);
        } else { // parse_url seems to have failed, try without it
            $return = $this->_decode_labels($input, false);
        }
    } else { // Otherwise we consider it being a pure domain name string
        $return = $this->_decode($input);
        if (!$return) $return = $input;
    }

    // The output is UTF-8 by default, other output formats need conversion here
    // If one time encoding is given, use this, else the objects property
    switch (($one_time_encoding) ? $one_time_encoding : $this->_api_encoding) {
        case 'utf8':
            return $return;
        case 'ucs4_string':
            $ucs4 = $this->_utf8_to_ucs4($return);
            // Do not hand a failed conversion on to the string builder
            return ($ucs4 === false) ? false : $this->_ucs4_to_ucs4_string($ucs4);
        case 'ucs4_array':
            return $this->_utf8_to_ucs4($return);
        default:
            $this->_error('Unsupported output format');
            return false;
    }
}

/**
 * Decode every dot separated label of a name, keeping labels that do not
 * decode as they are.
 *
 * @param    string   Dot separated name
 * @param    boolean  true: only try labels that start with the Punycode prefix
 *                    (so other labels do not overwrite the last error);
 *                    false: try every label
 * @return   string   Name with all decodable labels replaced
 * @access   private
 */
function _decode_labels($name, $prefixed_only)
{
    $prefix_len = strlen($this->_punycode_prefix);
    $labels = explode('.', $name);
    foreach ($labels as $pos => $label) {
        if ($prefixed_only && strncmp($label, $this->_punycode_prefix, $prefix_len) !== 0) {
            continue;
        }
        $conv = $this->_decode($label);
        if ($conv) $labels[$pos] = $conv;
    }
    return join('.', $labels);
}
|
|
|
|
/**
 * Encode a given domain name into its ACE form.
 *
 * The input is split at the characters . / : ? @ (the ideographic and
 * fullwidth dot variants are turned into a plain dot first); each part is
 * encoded on its own and parts that cannot be encoded are emitted as UTF-8.
 * In strict mode any of these separator characters is an error.
 *
 * @param    mixed    Domain name (UTF-8 string, UCS-4 string or UCS-4 array)
 * @param    string   Optional input encoding for this call only, see {@link set_parameter()}
 * @return   mixed    Encoded domain name (ACE string), false on error
 * @access   public
 */
function encode($decoded, $one_time_encoding = false)
{
    // Bring the input into UCS-4 array form.
    // If one time encoding is given, use this, else the objects property
    $format = $one_time_encoding ? $one_time_encoding : $this->_api_encoding;
    if ($format == 'utf8') {
        $decoded = $this->_utf8_to_ucs4($decoded);
    } elseif ($format == 'ucs4_string') {
        $decoded = $this->_ucs4_string_to_ucs4($decoded);
    } elseif ($format == 'ucs4_array') {
        // Already in the desired form
    } else {
        $this->_error('Unsupported input format: '.($one_time_encoding ? $one_time_encoding : $this->_api_encoding));
        return false;
    }

    // No input, no output, what else did you expect?
    if (empty($decoded)) return '';

    $segment_start = 0; // Offset of the first code point after the last separator
    $output = '';
    foreach ($decoded as $k => $v) {
        if ($v == 0x3002 || $v == 0xFF0E || $v == 0xFF61) {
            // Make sure to use just the plain dot
            $decoded[$k] = 0x2E;
        } elseif (!($v == 0x2E || $v == 0x2F || $v == 0x3A || $v == 0x3F || $v == 0x40)) {
            // Not a separator, nothing to do for this code point
            continue;
        }

        // Neither email addresses nor URLs allowed in strict mode
        if ($this->_strict_mode) {
            $this->_error('Neither email addresses nor URLs are allowed in strict mode.');
            return false;
        }

        // A separator at the very first position emits nothing
        if ($k) {
            $segment = array_slice($decoded, $segment_start, $k - $segment_start);
            $ace = $this->_encode($segment);
            $output .= $ace ? $ace : $this->_ucs4_to_utf8($segment);
            $output .= chr($decoded[$k]);
        }
        $segment_start = $k + 1;
    }

    if (!$segment_start) {
        // No separator seen: the whole input is one label
        $output = $this->_encode($decoded);
        if ($output) {
            return $output;
        }
        return $this->_ucs4_to_utf8($decoded);
    }

    // Catch the rest of the string
    $tail = array_slice($decoded, $segment_start, sizeof($decoded) - $segment_start);
    $ace = $this->_encode($tail);
    $output .= $ace ? $ace : $this->_ucs4_to_utf8($tail);
    return $output;
}
|
|
|
|
/**
 * Return the most recently recorded error.
 *
 * @return   mixed   Message of the last error that occurred; false if none was recorded
 * @access   public
 */
function get_last_error()
{
    $last_error = $this->_error;
    return $last_error;
}
|
|
|
|
/**
 * The actual decoding algorithm (Punycode decoding, RFC 3492 section 6.2).
 *
 * @param    string   A single ACE label including the Punycode prefix
 * @return   mixed    Decoded label as UTF-8 string, false on error
 *                    (the reason is available via {@link get_last_error()})
 * @access   private
 */
function _decode($encoded)
{
    $encoded = (string) $encoded;
    $prefix_len = strlen($this->_punycode_prefix);

    // We do need to find the Punycode prefix
    if (strncmp($encoded, $this->_punycode_prefix, $prefix_len) !== 0) {
        $this->_error('This is not a punycode string');
        return false;
    }
    // If nothing left after removing the prefix, it is hopeless
    $enco_len = strlen($encoded);
    if ($enco_len == $prefix_len) {
        $this->_error('The given encoded string was empty');
        return false;
    }

    // Everything between the prefix and the last delimiter is copied literally
    $decoded = array();
    $delim_pos = strrpos($encoded, '-');
    if ($delim_pos > $prefix_len) {
        for ($k = $prefix_len; $k < $delim_pos; ++$k) {
            $decoded[] = ord($encoded[$k]);
        }
    }
    $deco_len = count($decoded);

    // Wandering through the strings; init
    $is_first = true;
    $bias = $this->_initial_bias;
    $idx = 0;
    $char = $this->_initial_n;

    for ($enco_idx = ($delim_pos) ? ($delim_pos + 1) : 0; $enco_idx < $enco_len; ++$deco_len) {
        // Decode one generalized variable-length integer into $idx
        for ($old_idx = $idx, $w = 1, $k = $this->_base; 1; $k += $this->_base) {
            // A delta must end on a terminating digit; running off the end
            // of the string means the input was cut short
            if ($enco_idx >= $enco_len) {
                $this->_error('The given punycode string is truncated');
                return false;
            }
            $digit_char = $encoded[$enco_idx++];
            $code = ord($digit_char);
            $is_alnum = (48 <= $code && $code <= 57) || (65 <= $code && $code <= 90)
                    || (97 <= $code && $code <= 122);
            if (!$is_alnum) {
                $this->_error('The given punycode string contains an invalid digit');
                return false;
            }
            $digit = $this->_decode_digit($digit_char);
            $idx += $digit * $w;
            $t = ($k <= $bias) ? $this->_tmin :
                    (($k >= $bias + $this->_tmax) ? $this->_tmax : ($k - $bias));
            if ($digit < $t) break;
            $w = (int) ($w * ($this->_base - $t));
        }
        $bias = $this->_adapt($idx - $old_idx, $deco_len + 1, $is_first);
        $is_first = false;
        $char += (int) ($idx / ($deco_len + 1));
        $idx %= ($deco_len + 1);
        if ($char > $this->_max_ucs) {
            $this->_error('The given punycode string decodes to an invalid code point');
            return false;
        }
        // Insert the decoded char at position $idx, moving the rest one to the right
        array_splice($decoded, $idx, 0, array($char));
        $idx++;
    }
    return $this->_ucs4_to_utf8($decoded);
}
|
|
|
|
/**
 * The actual encoding algorithm (Nameprep followed by Punycode encoding,
 * RFC 3492 section 6.3).
 *
 * @param    array    A single label as UCS-4 array
 * @return   mixed    Encoded label (string); false if the label already carries
 *                    the Punycode prefix, contains nothing to encode, fails
 *                    Nameprep or contains code points that cannot be encoded
 * @access   private
 */
function _encode($decoded)
{
    // We cannot encode a domain name containing the Punycode prefix
    $extract = strlen($this->_punycode_prefix);
    $check_pref = $this->_utf8_to_ucs4($this->_punycode_prefix);
    $check_deco = array_slice($decoded, 0, $extract);

    if ($check_pref == $check_deco) {
        $this->_error('This is already a punycode string');
        return false;
    }
    // We will not try to encode strings consisting of basic code points only
    $encodable = false;
    foreach ($decoded as $v) {
        if ($v > 0x7a) {
            $encodable = true;
            break;
        }
    }
    if (!$encodable) {
        $this->_error('The given string does not contain encodable chars');
        return false;
    }

    // Do NAMEPREP
    $decoded = $this->_nameprep($decoded);
    if (!$decoded || !is_array($decoded)) return false; // NAMEPREP failed

    $deco_len = count($decoded);
    if (!$deco_len) return false; // Empty array

    $codecount = 0; // How many chars have been consumed

    $encoded = '';
    // Copy all basic code points to output. Every code point below 0x80 is
    // basic; skipping some of them here would leave them unconsumed forever,
    // because the loop below only ever handles code points >= 0x80.
    for ($i = 0; $i < $deco_len; ++$i) {
        $test = $decoded[$i];
        if (0 <= $test && $test < 0x80) {
            $encoded .= chr($test);
            $codecount++;
        }
    }
    if ($codecount == $deco_len) return $encoded; // All codepoints were basic ones

    // Start with the prefix; copy it to output
    $encoded = $this->_punycode_prefix.$encoded;

    // If we have basic code points in output, add an hyphen to the end
    if ($codecount) $encoded .= '-';

    // Now find and encode all non-basic code points
    $is_first = true;
    $cur_code = $this->_initial_n;
    $bias = $this->_initial_bias;
    $delta = 0;
    while ($codecount < $deco_len) {
        // Find the smallest code point >= the current code point
        $found = false;
        for ($i = 0, $next_code = $this->_max_ucs; $i < $deco_len; $i++) {
            if ($decoded[$i] >= $cur_code && $decoded[$i] <= $next_code) {
                $next_code = $decoded[$i];
                $found = true;
            }
        }
        if (!$found) {
            // Whatever is left lies outside of [0, _max_ucs]; without this
            // guard the loop would never terminate
            $this->_error('The given string contains a code point that cannot be encoded');
            return false;
        }

        $delta += ($next_code - $cur_code) * ($codecount + 1);
        $cur_code = $next_code;

        // Scan input again and encode all characters whose code point is $cur_code
        for ($i = 0; $i < $deco_len; $i++) {
            if ($decoded[$i] < $cur_code) {
                $delta++;
            } elseif ($decoded[$i] == $cur_code) {
                // Emit $delta as generalized variable-length integer
                for ($q = $delta, $k = $this->_base; 1; $k += $this->_base) {
                    $t = ($k <= $bias) ? $this->_tmin :
                            (($k >= $bias + $this->_tmax) ? $this->_tmax : $k - $bias);
                    if ($q < $t) break;
                    $encoded .= $this->_encode_digit(intval($t + (($q - $t) % ($this->_base - $t)))); //v0.4.5 Changed from ceil() to intval()
                    $q = (int) (($q - $t) / ($this->_base - $t));
                }
                $encoded .= $this->_encode_digit($q);
                $bias = $this->_adapt($delta, $codecount+1, $is_first);
                $codecount++;
                $delta = 0;
                $is_first = false;
            }
        }
        $delta++;
        $cur_code++;
    }
    return $encoded;
}
|
|
|
|
/**
 * Compute the next bias from the delta that was just processed.
 *
 * @param    integer  Delta that was just encoded / decoded
 * @param    integer  Number of code points handled so far, including the current one
 * @param    boolean  Whether this is the first delta of the label
 * @return   integer  The new bias
 * @access   private
 */
function _adapt($delta, $npoints, $is_first)
{
    $span = $this->_base - $this->_tmin;
    $threshold = ($span * $this->_tmax) / 2;

    // The very first delta is damped more strongly than later ones
    $delta = (int) ($delta / ($is_first ? $this->_damp : 2));
    $delta += (int) ($delta / $npoints);

    $k = 0;
    while ($delta > $threshold) {
        $delta = (int) ($delta / $span);
        $k += $this->_base;
    }
    return (int) ($k + ($span + 1) * $delta / ($delta + $this->_skew));
}
|
|
|
|
/**
 * Map a digit value to its character: values below 26 become 'a'..'z',
 * the values from 26 upwards continue at '0'.
 *
 * @param    integer  Digit value
 * @return   string   Single character representing the digit
 * @access   private
 */
function _encode_digit($d)
{
    // 97 is ord('a'); 22 is ord('0') - 26
    $offset = ($d < 26) ? 97 : 22;
    return chr($d + $offset);
}
|
|
|
|
/**
 * Map a character to its digit value: '0'..'9' give 26..35,
 * 'A'..'Z' and 'a'..'z' give 0..25.
 *
 * NOTE(review): characters outside these ranges are not rejected
 * consistently. The comparisons are signed, so every byte below '0' yields
 * its code minus 22 and the bytes between the ranges yield negative values;
 * only bytes above 'z' yield _base. Callers appear to depend on this for
 * terminating on malformed input, so it is kept as is.
 *
 * @param    string   Single character
 * @return   integer  Digit value
 * @access   private
 */
function _decode_digit($cp)
{
    $code = ord($cp);
    if ($code < 58) {
        return $code - 22;
    }
    if ($code < 91) {
        return $code - 65;
    }
    if ($code < 123) {
        return $code - 97;
    }
    return $this->_base;
}
|
|
|
|
/**
 * Remember an error message so that it can be fetched with get_last_error().
 *
 * @param    string   The message to store; defaults to an empty string
 * @access   private
 */
function _error($error = '')
{
    $message = $error;
    $this->_error = $message;
}
|
|
|
|
/**
 * Do Nameprep according to RFC3491 and RFC3454
 *
 * Steps: drop code points that map to nothing, reject prohibited code
 * points, apply the replacement maps / Hangul decomposition, then try to
 * compose the result again.
 *
 * @param    array    Unicode Characters (UCS-4 array)
 * @return   mixed    Unicode Characters, Nameprep'd (UCS-4 array); false on prohibited input
 * @access   private
 */
function _nameprep($input)
{
    $output = array();
    //
    // Mapping
    // Walking through the input array, performing the required steps on each of
    // the input chars and putting the result into the output array
    // While mapping required chars we apply the canonical ordering
    foreach ($input as $v) {
        // Map to nothing == skip that code point
        if (in_array($v, $this->NP['map_nothing'])) continue;

        // Try to find prohibited input
        if (in_array($v, $this->NP['prohibit']) || in_array($v, $this->NP['general_prohibited'])) {
            $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
            return false;
        }
        foreach ($this->NP['prohibit_ranges'] as $range) {
            if ($range[0] <= $v && $v <= $range[1]) {
                $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
                return false;
            }
        }
        //
        // Hangul syllable decomposition
        if (0xAC00 <= $v && $v <= 0xD7AF) {
            foreach ($this->_hangul_decompose($v) as $out) {
                $output[] = (int) $out;
            }
        // There's a decomposition mapping for that code point
        } elseif (isset($this->NP['replacemaps'][$v])) {
            foreach ($this->_apply_cannonical_ordering($this->NP['replacemaps'][$v]) as $out) {
                $output[] = (int) $out;
            }
        } else {
            $output[] = (int) $v;
        }
    }
    // Before applying any Combining, try to rearrange any Hangul syllables
    $output = $this->_hangul_compose($output);
    //
    // Combine code points
    //
    $last_class = 0;
    $last_starter = 0;
    $out_len = count($output);
    for ($i = 0; $i < $out_len; ++$i) {
        $class = $this->_get_combining_class($output[$i]);
        if ((!$last_class || $last_class > $class) && $class) {
            // Try to match
            $seq_len = $i - $last_starter;
            $out = $this->_combine(array_slice($output, $last_starter, $seq_len));
            // On match: Replace the last starter with the composed character and remove
            // the now redundant non-starter(s)
            if ($out) {
                $output[$last_starter] = $out;
                // _combine() yields a single code point. The original compared
                // count($out) here, which was always 1 on PHP <= 7 and throws a
                // TypeError for scalars on PHP 8.
                if (1 != $seq_len) {
                    for ($j = $i+1; $j < $out_len; ++$j) {
                        $output[$j-1] = $output[$j];
                    }
                    // NOTE(review): $out_len is one past the last index, so this
                    // unset() looks like a no-op and the last element stays
                    // duplicated - confirm intent before changing it.
                    unset($output[$out_len]);
                }
                // Rewind the for loop by one, since there can be more possible compositions
                $i--;
                $out_len--;
                $last_class = ($i == $last_starter) ? 0 : $this->_get_combining_class($output[$i-1]);
                continue;
            }
        }
        // The current class is 0
        if (!$class) $last_starter = $i;
        $last_class = $class;
    }
    return $output;
}
|
|
|
|
/**
 * Decomposes a Hangul syllable
 * (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
 *
 * @param    integer  32bit UCS4 code point
 * @return   array    Either Hangul Syllable decomposed (two or three integer
 *                    code points) or original 32bit value as one value array
 * @access   private
 */
function _hangul_decompose($char)
{
    $sindex = (int) $char - $this->_sbase;
    if ($sindex < 0 || $sindex >= $this->_scount) {
        return array($char);
    }
    // The casts have to wrap the divisions: "(int) $base + $a / $b" would only
    // cast the base and return a fractional float.
    $result = array();
    $result[] = $this->_lbase + (int) ($sindex / $this->_ncount);
    $result[] = $this->_vbase + (int) (($sindex % $this->_ncount) / $this->_tcount);
    $T = intval($this->_tbase + $sindex % $this->_tcount);
    // A trailing index of 0 means the syllable has no trailing consonant
    if ($T != $this->_tbase) $result[] = $T;
    return $result;
}
|
|
/**
 * Composes Hangul syllables
 * (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
 *
 * @param    array    Decomposed UCS4 sequence
 * @return   array    UCS4 sequence with syllables composed
 * @access   private
 */
function _hangul_compose($input)
{
    $inp_len = count($input);
    if (!$inp_len) return array();
    $result = array();
    $last = (int) $input[0];
    $result[] = $last; // copy first char from input to output

    for ($i = 1; $i < $inp_len; ++$i) {
        $char = (int) $input[$i];
        $sindex = $last - $this->_sbase;
        $lindex = $last - $this->_lbase;
        $vindex = $char - $this->_vbase;
        $tindex = $char - $this->_tbase;
        // Find out, whether two current characters are LV and T.
        // The trailing index must lie strictly between 0 and _tcount: index 0
        // is _tbase itself (not a trailing consonant) and index _tcount lies
        // beyond the last one, so accepting either would swallow the char or
        // produce the wrong syllable.
        if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount == 0)
                && 0 < $tindex && $tindex < $this->_tcount) {
            // create syllable of form LVT
            $last += $tindex;
            $result[(count($result) - 1)] = $last; // reset last
            continue; // discard char
        }
        // Find out, whether two current characters form L and V
        if (0 <= $lindex && $lindex < $this->_lcount && 0 <= $vindex && $vindex < $this->_vcount) {
            // create syllable of form LV
            $last = (int) $this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount;
            $result[(count($result) - 1)] = $last; // reset last
            continue; // discard char
        }
        // if neither case was true, just add the character
        $last = $char;
        $result[] = $char;
    }
    return $result;
}
|
|
|
|
/**
 * Look up the combining class of a code point in the 'norm_combcls' table.
 *
 * @param    integer  Wide char to check (32bit integer)
 * @return   integer  Combining class if the table has an entry, else 0
 * @access   private
 */
function _get_combining_class($char)
{
    if (isset($this->NP['norm_combcls'][$char])) {
        return $this->NP['norm_combcls'][$char];
    }
    return 0;
}
|
|
|
|
/**
 * Applies the canonical ordering to a decomposed UCS4 sequence: code points
 * with a non-zero combining class are moved leftward past neighbours with a
 * higher combining class until a full pass causes no more swaps.
 *
 * @param    array    Decomposed UCS4 sequence
 * @return   array    Ordered UCS4 sequence
 * @access   private
 */
function _apply_cannonical_ordering($input)
{
    $size = count($input);
    do {
        $swapped = false;
        $prev_class = $this->_get_combining_class(intval($input[0]));
        for ($pos = 0; $pos < $size - 1; ++$pos) {
            $cur_class = $this->_get_combining_class(intval($input[$pos + 1]));
            if ($cur_class != 0 && $prev_class > $cur_class) {
                // Move item leftward until it fits
                $j = $pos + 1;
                while ($j > 0 && $this->_get_combining_class(intval($input[$j - 1])) > $cur_class) {
                    $moved = intval($input[$j]);
                    $input[$j] = intval($input[$j - 1]);
                    $input[$j - 1] = $moved;
                    $swapped = true;
                    --$j;
                }
                // Continue the pass with the class of the char that now sits here
                $cur_class = $prev_class;
            }
            $prev_class = $cur_class;
        }
    } while ($swapped);
    return $input;
}
|
|
|
|
/**
 * Do composition of a sequence of starter and non-starter: searches the
 * 'replacemaps' table for an entry whose target equals the given sequence.
 *
 * @param    array    UCS4 Decomposed sequence
 * @return   mixed    Key (source code point) of the first matching table entry,
 *                    false if there is none
 * @access   private
 */
function _combine($input)
{
    $wanted_len = count($input);
    foreach ($this->NP['replacemaps'] as $source => $target) {
        // Cheap checks first: same first element, same length
        if ($target[0] != $input[0] || count($target) != $wanted_len) {
            continue;
        }
        // An empty sequence never counts as a match
        $matches = ($wanted_len > 0);
        foreach ($input as $pos => $code_point) {
            if ($code_point != $target[$pos]) {
                $matches = false;
                break;
            }
        }
        if ($matches) {
            return $source;
        }
    }
    return false;
}
|
|
|
|
/**
 * This converts an UTF-8 encoded string to its UCS-4 representation
 * By talking about UCS-4 "strings" we mean arrays of 32bit integers representing
 * each of the "chars". This is due to PHP not being able to handle strings with
 * bit depth different from 8. This applies to the reverse method _ucs4_to_utf8(), too.
 * The following UTF-8 encodings are supported:
 * bytes bits  representation
 * 1        7  0xxxxxxx
 * 2       11  110xxxxx 10xxxxxx
 * 3       16  1110xxxx 10xxxxxx 10xxxxxx
 * 4       21  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 5       26  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 6       31  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * Each x represents a bit that can be used to store character data.
 * The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000
 *
 * @param    string   UTF-8 encoded input
 * @return   mixed    Array of code points; false on malformed input
 *                    (including a multi byte sequence cut off at the end)
 * @access   private
 */
function _utf8_to_ucs4($input)
{
    $output = array();
    $out_len = 0;
    $inp_len = strlen($input);
    $mode = 'next'; // 'next': expecting a start byte, 'add': expecting a continuation byte
    $test = 'none'; // 'range': the next continuation byte still needs the range check
    for ($k = 0; $k < $inp_len; ++$k) {
        $v = ord($input[$k]); // Extract byte from input string

        if ($v < 128) { // We found an ASCII char - put into string as is
            if ('add' == $mode) {
                $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                return false;
            }
            $output[$out_len] = $v;
            ++$out_len;
            continue;
        }
        if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
            $start_byte = $v;
            $mode = 'add';
            $test = 'range';
            if ($v >> 5 == 6) { // &110xxxxx 10xxxxx
                $next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
                $v = ($v - 192) << 6;
            } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
                $next_byte = 1;
                $v = ($v - 224) << 12;
            } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                $next_byte = 2;
                $v = ($v - 240) << 18;
            } elseif ($v >> 2 == 62) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                $next_byte = 3;
                $v = ($v - 248) << 24;
            } elseif ($v >> 1 == 126) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                $next_byte = 4;
                $v = ($v - 252) << 30;
            } else {
                $this->_error('This might be UTF-8, but I don\'t understand it at byte '.$k);
                return false;
            }
            // Store the bits of the start byte; continuation bytes are added below
            $output[$out_len] = (int) $v;
            ++$out_len;
            continue;
        }

        // From here on we are in 'add' mode, i.e. inside a multi byte sequence
        if (!$this->_allow_overlong && $test == 'range') {
            $test = 'none';
            if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
                $this->_error('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
                return false;
            }
        }
        if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
            $v = ($v - 128) << ($next_byte * 6);
            $output[($out_len - 1)] += $v;
            --$next_byte;
        } else {
            $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
            return false;
        }
        if ($next_byte < 0) {
            $mode = 'next';
        }
    } // for

    // Still waiting for continuation bytes: the last sequence is incomplete
    if ('add' == $mode) {
        $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$inp_len);
        return false;
    }
    return $output;
}
|
|
|
|
/**
 * Convert UCS-4 array into UTF-8 string
 * Sequences of up to six bytes are produced, see _utf8_to_ucs4() for the layout
 *
 * @param    array    Code points
 * @return   mixed    UTF-8 string; false if a value does not fit into 31 bits
 * @access   private
 */
function _ucs4_to_utf8($input)
{
    $utf8 = '';
    $position = 0;
    foreach ($input as $code_point) {
        ++$position;
        if ($code_point < 128) { // 7bit are transferred literally
            $utf8 .= chr($code_point);
            continue;
        }
        // Pick the marker of the start byte and the number of continuation bytes
        if ($code_point < (1 << 11)) {
            $lead = 192;
            $trailing = 1;
        } elseif ($code_point < (1 << 16)) {
            $lead = 224;
            $trailing = 2;
        } elseif ($code_point < (1 << 21)) {
            $lead = 240;
            $trailing = 3;
        } elseif ($code_point < (1 << 26)) {
            $lead = 248;
            $trailing = 4;
        } elseif ($code_point < (1 << 31)) {
            $lead = 252;
            $trailing = 5;
        } else {
            $this->_error('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$position);
            return false;
        }
        // Start byte carries the topmost bits, every continuation byte six more
        $utf8 .= chr($lead + ($code_point >> (6 * $trailing)));
        for ($shift = 6 * ($trailing - 1); $shift >= 0; $shift -= 6) {
            $utf8 .= chr(128 + (($code_point >> $shift) & 63));
        }
    }
    return $utf8;
}
|
|
|
|
/**
 * Convert UCS-4 array into UCS-4 string
 * Every value becomes four bytes, most significant byte first
 *
 * @param    array    Code points
 * @return   string   Four bytes per code point
 * @access   private
 */
function _ucs4_to_ucs4_string($input)
{
    $bytes = '';
    foreach ($input as $code_point) {
        // The bit mask is 255, which reads &11111111
        for ($shift = 24; $shift >= 0; $shift -= 8) {
            $bytes .= chr(($code_point >> $shift) & 255);
        }
    }
    return $bytes;
}
|
|
|
|
/**
 * Convert UCS-4 string into UCS-4 array
 * Every four bytes form one value, most significant byte first
 *
 * @param    string   UCS-4 string; its length must be a multiple of 4
 * @return   mixed    Array of code points; false if the length is not a multiple of 4
 * @access   private
 */
function _ucs4_string_to_ucs4($input)
{
    $inp_len = strlen($input);
    // Input length must be dividable by 4
    if ($inp_len % 4) {
        $this->_error('Input UCS4 string is broken');
        return false;
    }
    $output = array();
    // Empty input simply skips the loop and yields an empty array
    for ($i = 0; $i < $inp_len; $i += 4) {
        $output[] = (ord($input[$i]) << 24)
                + (ord($input[$i + 1]) << 16)
                + (ord($input[$i + 2]) << 8)
                + ord($input[$i + 3]);
    }
    return $output;
}
|
|
}
|
|
|
|
/**
 * Adapter class for aligning the API of idna_convert with that of Net_IDNA
 * @author  Matthias Sommerfeld <mso@phlylabs.de>
 */
class Net_IDNA_php4 extends idna_convert
{
    /**
     * Sets a new option value by forwarding to the inherited set_parameter().
     * Available options and values:
     * [encoding - 'utf8', 'ucs4_string' or 'ucs4_array']
     * [overlong - true to allow unnecessarily long UTF-8 encodings of chars,
     *             false to forbid them]
     * [strict   - true: strict mode; false: loose mode]
     *
     * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)
     * @param    string    Value to use (if parameter 1 is a string)
     * @return   boolean   true on success, false otherwise
     * @access   public
     */
    function setParams($option, $param = false)
    {
        // The previous code called set_parameters() on an undefined $this->IC
        // member; the method is inherited and is named set_parameter().
        return $this->set_parameter($option, $param);
    }
}
|
|
|
|
?>
|