# encoding:utf-8 #-- # Copyright (C) 2006-2013 Bob Aman # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. #++ module Addressable module IDNA # This module is loosely based on idn_actionmailer by Mick Staugaard, # the unicode library by Yoshida Masato, and the punycode implementation # by Kazuhiro Nishiyama. Most of the code was copied verbatim, but # some reformatting was done, and some translation from C was done. # # Without their code to work from as a base, we'd all still be relying # on the presence of libidn. Which nobody ever seems to have installed. # # Original sources: # http://github.com/staugaard/idn_actionmailer # http://www.yoshidam.net/Ruby.html#unicode # http://rubyforge.org/frs/?group_id=2550 UNICODE_TABLE = File.expand_path( File.join(File.dirname(__FILE__), '../../..', 'data/unicode.data') ) ACE_PREFIX = "xn--" UTF8_REGEX = /\A(?: [\x09\x0A\x0D\x20-\x7E] # ASCII | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 | [\xF1-\xF3][\x80-\xBF]{3} # planes 4nil5 | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 )*\z/mnx UTF8_REGEX_MULTIBYTE = /(?: [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 | [\xF1-\xF3][\x80-\xBF]{3} # planes 4nil5 | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 )/mnx # :startdoc: # Converts from a Unicode internationalized domain name to an ASCII # domain name as described in RFC 3490. def self.to_ascii(input) input = input.dup if input.respond_to?(:force_encoding) input.force_encoding(Encoding::ASCII_8BIT) end if input =~ UTF8_REGEX && input =~ UTF8_REGEX_MULTIBYTE parts = unicode_downcase(input).split('.') parts.map! do |part| if part.respond_to?(:force_encoding) part.force_encoding(Encoding::ASCII_8BIT) end if part =~ UTF8_REGEX && part =~ UTF8_REGEX_MULTIBYTE ACE_PREFIX + punycode_encode(unicode_normalize_kc(part)) else part end end parts.join('.') else input end end # Converts from an ASCII domain name to a Unicode internationalized # domain name as described in RFC 3490. def self.to_unicode(input) parts = input.split('.') parts.map! do |part| if part =~ /^#{ACE_PREFIX}/ punycode_decode(part[/^#{ACE_PREFIX}(.+)/, 1]) else part end end output = parts.join('.') if output.respond_to?(:force_encoding) output.force_encoding(Encoding::UTF_8) end output end # Unicode normalization form KC. def self.unicode_normalize_kc(input) input = input.to_s unless input.is_a?(String) unpacked = input.unpack("U*") unpacked = unicode_compose(unicode_sort_canonical(unicode_decompose(unpacked))) return unpacked.pack("U*") end ## # Unicode aware downcase method. # # @api private # @param [String] input # The input string. # @return [String] The downcased result. def self.unicode_downcase(input) unpacked = input.unpack("U*") unpacked.map! { |codepoint| lookup_unicode_lowercase(codepoint) } return unpacked.pack("U*") end (class <= HANGUL_LBASE && ch_one < HANGUL_LBASE + HANGUL_LCOUNT && ch_two >= HANGUL_VBASE && ch_two < HANGUL_VBASE + HANGUL_VCOUNT # Hangul L + V return HANGUL_SBASE + ( (ch_one - HANGUL_LBASE) * HANGUL_VCOUNT + (ch_two - HANGUL_VBASE) ) * HANGUL_TCOUNT elsif ch_one >= HANGUL_SBASE && ch_one < HANGUL_SBASE + HANGUL_SCOUNT && (ch_one - HANGUL_SBASE) % HANGUL_TCOUNT == 0 && ch_two >= HANGUL_TBASE && ch_two < HANGUL_TBASE + HANGUL_TCOUNT # Hangul LV + T return ch_one + (ch_two - HANGUL_TBASE) end p = [] ucs4_to_utf8 = lambda do |ch| # For some reason, rcov likes to drop BUS errors here. if ch < 128 p << ch elsif ch < 2048 p << (ch >> 6 | 192) p << (ch & 63 | 128) elsif ch < 0x10000 p << (ch >> 12 | 224) p << (ch >> 6 & 63 | 128) p << (ch & 63 | 128) elsif ch < 0x200000 p << (ch >> 18 | 240) p << (ch >> 12 & 63 | 128) p << (ch >> 6 & 63 | 128) p << (ch & 63 | 128) elsif ch < 0x4000000 p << (ch >> 24 | 248) p << (ch >> 18 & 63 | 128) p << (ch >> 12 & 63 | 128) p << (ch >> 6 & 63 | 128) p << (ch & 63 | 128) elsif ch < 0x80000000 p << (ch >> 30 | 252) p << (ch >> 24 & 63 | 128) p << (ch >> 18 & 63 | 128) p << (ch >> 12 & 63 | 128) p << (ch >> 6 & 63 | 128) p << (ch & 63 | 128) end end ucs4_to_utf8.call(ch_one) ucs4_to_utf8.call(ch_two) return lookup_unicode_composition(p) end (class < cc unpacked[i] = last unpacked[i-1] = ch i -= 1 if i > 1 else i += 1 end end return unpacked end (class <= HANGUL_SBASE && cp < HANGUL_SBASE + HANGUL_SCOUNT l, v, t = unicode_decompose_hangul(cp) unpacked_result << l unpacked_result << v if v unpacked_result << t if t else dc = lookup_unicode_compatibility(cp) unless dc unpacked_result << cp else unpacked_result.concat(unicode_decompose(dc.unpack("U*"))) end end end return unpacked_result end (class <= HANGUL_SCOUNT l = codepoint v = t = nil return l, v, t end l = HANGUL_LBASE + sindex / HANGUL_NCOUNT v = HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT t = HANGUL_TBASE + sindex % HANGUL_TCOUNT if t == HANGUL_TBASE t = nil end return l, v, t end (class <?" + "@ABCDEFGHIJKLMNO" + "PQRSTUVWXYZ[\\]^_" + "`abcdefghijklmno" + "pqrstuvwxyz{|}~\n" # Input is invalid. class PunycodeBadInput < StandardError; end # Output would exceed the space provided. class PunycodeBigOutput < StandardError; end # Input needs wider integers to process. class PunycodeOverflow < StandardError; end def self.punycode_encode(unicode) input = unicode.unpack("U*") output = [0] * (ACE_MAX_LENGTH + 1) input_length = input.size output_length = [ACE_MAX_LENGTH] # Initialize the state n = PUNYCODE_INITIAL_N delta = out = 0 max_out = output_length[0] bias = PUNYCODE_INITIAL_BIAS # Handle the basic code points: input_length.times do |j| if punycode_basic?(input[j]) if max_out - out < 2 raise PunycodeBigOutput, "Output would exceed the space provided." end output[out] = input[j] out += 1 end end h = b = out # h is the number of code points that have been handled, b is the # number of basic code points, and out is the number of characters # that have been output. if b > 0 output[out] = PUNYCODE_DELIMITER out += 1 end # Main encoding loop: while h < input_length # All non-basic code points < n have been # handled already. Find the next larger one: m = PUNYCODE_MAXINT input_length.times do |j| m = input[j] if (n...m) === input[j] end # Increase delta enough to advance the decoder's # state to , but guard against overflow: if m - n > (PUNYCODE_MAXINT - delta) / (h + 1) raise PunycodeOverflow, "Input needs wider integers to process." end delta += (m - n) * (h + 1) n = m input_length.times do |j| # Punycode does not need to check whether input[j] is basic: if input[j] < n delta += 1 if delta == 0 raise PunycodeOverflow, "Input needs wider integers to process." end end if input[j] == n # Represent delta as a generalized variable-length integer: q = delta; k = PUNYCODE_BASE while true if out >= max_out raise PunycodeBigOutput, "Output would exceed the space provided." end t = ( if k <= bias PUNYCODE_TMIN elsif k >= bias + PUNYCODE_TMAX PUNYCODE_TMAX else k - bias end ) break if q < t output[out] = punycode_encode_digit(t + (q - t) % (PUNYCODE_BASE - t)) out += 1 q = (q - t) / (PUNYCODE_BASE - t) k += PUNYCODE_BASE end output[out] = punycode_encode_digit(q) out += 1 bias = punycode_adapt(delta, h + 1, h == b) delta = 0 h += 1 end end delta += 1 n += 1 end output_length[0] = out outlen = out outlen.times do |j| c = output[j] unless c >= 0 && c <= 127 raise Exception, "Invalid output char." end unless PUNYCODE_PRINT_ASCII[c] raise PunycodeBadInput, "Input is invalid." end end output[0..outlen].map { |x| x.chr }.join("").sub(/\0+\z/, "") end (class <= 0 && c <= 127 raise PunycodeBadInput, "Input is invalid." end input.push(c) end input_length = input.length output_length = [UNICODE_MAX_LENGTH] # Initialize the state n = PUNYCODE_INITIAL_N out = i = 0 max_out = output_length[0] bias = PUNYCODE_INITIAL_BIAS # Handle the basic code points: Let b be the number of input code # points before the last delimiter, or 0 if there is none, then # copy the first b code points to the output. b = 0 input_length.times do |j| b = j if punycode_delimiter?(input[j]) end if b > max_out raise PunycodeBigOutput, "Output would exceed the space provided." end b.times do |j| unless punycode_basic?(input[j]) raise PunycodeBadInput, "Input is invalid." end output[out] = input[j] out+=1 end # Main decoding loop: Start just after the last delimiter if any # basic code points were copied; start at the beginning otherwise. in_ = b > 0 ? b + 1 : 0 while in_ < input_length # in_ is the index of the next character to be consumed, and # out is the number of code points in the output array. # Decode a generalized variable-length integer into delta, # which gets added to i. The overflow checking is easier # if we increase i as we go, then subtract off its starting # value at the end to obtain delta. oldi = i; w = 1; k = PUNYCODE_BASE while true if in_ >= input_length raise PunycodeBadInput, "Input is invalid." end digit = punycode_decode_digit(input[in_]) in_+=1 if digit >= PUNYCODE_BASE raise PunycodeBadInput, "Input is invalid." end if digit > (PUNYCODE_MAXINT - i) / w raise PunycodeOverflow, "Input needs wider integers to process." end i += digit * w t = ( if k <= bias PUNYCODE_TMIN elsif k >= bias + PUNYCODE_TMAX PUNYCODE_TMAX else k - bias end ) break if digit < t if w > PUNYCODE_MAXINT / (PUNYCODE_BASE - t) raise PunycodeOverflow, "Input needs wider integers to process." end w *= PUNYCODE_BASE - t k += PUNYCODE_BASE end bias = punycode_adapt(i - oldi, out + 1, oldi == 0) # I was supposed to wrap around from out + 1 to 0, # incrementing n each time, so we'll fix that now: if i / (out + 1) > PUNYCODE_MAXINT - n raise PunycodeOverflow, "Input needs wider integers to process." end n += i / (out + 1) i %= out + 1 # Insert n at position i of the output: # not needed for Punycode: # raise PUNYCODE_INVALID_INPUT if decode_digit(n) <= base if out >= max_out raise PunycodeBigOutput, "Output would exceed the space provided." end #memmove(output + i + 1, output + i, (out - i) * sizeof *output) output[i + 1, out - i] = output[i, out - i] output[i] = n i += 1 out += 1 end output_length[0] = out output.pack("U*") end (class <> 1 # delta >> 1 is a faster way of doing delta / 2 delta += delta / numpoints difference = PUNYCODE_BASE - PUNYCODE_TMIN k = 0 while delta > (difference * PUNYCODE_TMAX) / 2 delta /= difference k += PUNYCODE_BASE end k + (difference + 1) * delta / (delta + PUNYCODE_SKEW) end (class <