Better support for unicode path and values.

Added WSF_REQUEST.percent_encoded_path_info: READABLE_STRING_8
    to keep url encoded path info, as it is useful for specific component

The router is now using WSF_REQUEST.percent_encoded_path_info
    since URI_TEMPLATE are handling URI (and not IRI)
    this fixes an issue with unicode path parameters.

This should not break existing code, and this fixes various unicode related issues related
   to PATH parameter and path info
   but also any component using file names.

(required EiffelStudio >= 7.2)
This commit is contained in:
2013-06-12 18:00:55 +02:00
parent a982286dd4
commit cc4ef1a575
28 changed files with 1056 additions and 449 deletions

View File

@@ -0,0 +1,523 @@
note
description: "[
Component to handle percent encoding
]"
date: "$Date: 2013-05-21 01:15:17 +0200 (mar., 21 mai 2013) $"
revision: "$Revision: 92557 $"
EIS: "name=Percent-encoding", "protocol=URI", "src=http://en.wikipedia.org/wiki/Percent-encoding"
class
WSF_PERCENT_ENCODER
feature -- Percent encoding
percent_encoded_string (v: READABLE_STRING_GENERAL): STRING_8
-- Return `a_string' percent-encoded
do
create Result.make (v.count)
append_percent_encoded_string_to (v, Result)
end
append_percent_encoded_string_to (s: READABLE_STRING_GENERAL; a_result: STRING_GENERAL)
-- Append `a_string' as percent-encoded value to `a_result'
local
c: NATURAL_32
i,n: INTEGER
do
from
i := 1
n := s.count
until
i > n
loop
c := s.code (i)
if
--| unreserved ALPHA / DIGIT
(48 <= c and c <= 57) -- DIGIT: 0 .. 9
or (65 <= c and c <= 90) -- ALPHA: A .. Z
or (97 <= c and c <= 122) -- ALPHA: a .. z
then
a_result.append_code (c)
else
inspect c
when
45, 46, 95, 126 -- unreserved characters: -._~
then
a_result.append_code (c)
when
58, 64, -- reserved =+ gen-delims: :@
33, 36, 38, 39, 40, 41, 42, -- reserved =+ sub-delims: !$&'()*
43, 44, 59, 61, -- reserved = sub-delims: +,;=
37 -- percent encoding: %
then
append_percent_encoded_character_code_to (c, a_result)
else
append_percent_encoded_character_code_to (c, a_result)
end
end
i := i + 1
end
end
feature -- Percent encoding: character
append_percent_encoded_character_code_to (a_code: NATURAL_32; a_result: STRING_GENERAL)
-- Append character code `a_code' as percent-encoded content into `a_result'
do
if a_code > 0xFF then
-- Unicode
append_percent_encoded_unicode_character_code_to (a_code, a_result)
elseif a_code > 0x7F then
-- Extended ASCII
-- This requires percent-encoding on UTF-8 converted character.
append_percent_encoded_unicode_character_code_to (a_code, a_result)
else
-- ASCII
append_percent_encoded_ascii_character_code_to (a_code, a_result)
end
ensure
appended: a_result.count > old a_result.count
end
feature {NONE} -- Implementation: character encoding
append_percent_encoded_ascii_character_code_to (a_code: NATURAL_32; a_result: STRING_GENERAL)
-- Append extended ascii character code `a_code' as percent-encoded content into `a_result'
-- Note: it does not UTF-8 convert this extended ASCII.
require
is_extended_ascii: a_code <= 0xFF
local
c: INTEGER
do
if a_code > 0xFF then
-- Unicode
append_percent_encoded_unicode_character_code_to (a_code, a_result)
else
-- Extended ASCII
c := a_code.to_integer_32
a_result.append_code (37) -- 37 '%%'
a_result.append_code (hex_digit [c |>> 4])
a_result.append_code (hex_digit [c & 0xF])
end
ensure
appended: a_result.count > old a_result.count
end
append_percent_encoded_unicode_character_code_to (a_code: NATURAL_32; a_result: STRING_GENERAL)
-- Append Unicode character code `a_code' as UTF-8 and percent-encoded content into `a_result'
-- Note: it does include UTF-8 conversion of extended ASCII and Unicode.
do
if a_code <= 0x7F then
-- 0xxxxxxx
append_percent_encoded_ascii_character_code_to (a_code, a_result)
elseif a_code <= 0x7FF then
-- 110xxxxx 10xxxxxx
append_percent_encoded_ascii_character_code_to ((a_code |>> 6) | 0xC0, a_result)
append_percent_encoded_ascii_character_code_to ((a_code & 0x3F) | 0x80, a_result)
elseif a_code <= 0xFFFF then
-- 1110xxxx 10xxxxxx 10xxxxxx
append_percent_encoded_ascii_character_code_to ((a_code |>> 12) | 0xE0, a_result)
append_percent_encoded_ascii_character_code_to (((a_code |>> 6) & 0x3F) | 0x80, a_result)
append_percent_encoded_ascii_character_code_to ((a_code & 0x3F) | 0x80, a_result)
else
-- c <= 1FFFFF - there are no higher code points
-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
append_percent_encoded_ascii_character_code_to ((a_code |>> 18) | 0xF0, a_result)
append_percent_encoded_ascii_character_code_to (((a_code |>> 12) & 0x3F) | 0x80, a_result)
append_percent_encoded_ascii_character_code_to (((a_code |>> 6) & 0x3F) | 0x80, a_result)
append_percent_encoded_ascii_character_code_to ((a_code & 0x3F) | 0x80, a_result)
end
ensure
appended: a_result.count > old a_result.count
end
feature -- Percent decoding
percent_decoded_string (v: READABLE_STRING_GENERAL): STRING_32
-- Return the percent decoded string equivalent to the percent-encoded string `v'
--| Note that is `a_result' is a STRING_8, any Unicode character will be kept as UTF-8
do
create Result.make (v.count)
append_percent_decoded_string_to (v, Result)
end
append_percent_decoded_string_to (v: READABLE_STRING_GENERAL; a_result: STRING_GENERAL)
-- Append to `a_result' a string equivalent to the percent-encoded string `v'
--| Note that is `a_result' is a STRING_8, any Unicode character will be kept as UTF-8
local
i,n: INTEGER
c: NATURAL_32
pr: CELL [INTEGER]
a_result_is_string_32: BOOLEAN
do
a_result_is_string_32 := attached {STRING_32} a_result
from
i := 1
create pr.put (i)
n := v.count
until
i > n
loop
c := v.code (i)
inspect c
when 43 then -- 43 '+'
-- Some implementation are replacing spaces with "+" instead of "%20"
a_result.append_code (32) -- 32 ' '
when 37 then -- 37 '%%'
-- An escaped character ?
if i = n then -- Error?
a_result.append_code (c)
else
if a_result_is_string_32 then
-- Convert UTF-8 to UTF-32
pr.replace (i)
c := next_percent_decoded_unicode_character_code (v, pr)
a_result.append_code (c)
i := pr.item
else
-- Keep UTF-8
pr.replace (i)
c := next_percent_decoded_character_code (v, pr)
a_result.append_code (c)
i := pr.item
end
end
else
if c <= 0x7F then
a_result.append_code (c)
else
if a_result_is_string_32 then
a_result.append_code (c)
else
append_percent_encoded_character_code_to (c, a_result)
end
end
end
i := i + 1
end
end
feature {NONE} -- Implementation: decoding
next_percent_decoded_character_code (v: READABLE_STRING_GENERAL; a_position: CELL [INTEGER]): NATURAL_32
-- Character decoded from string `v' starting from index `a_position.item'
-- note: it also updates `a_position.item' to indicate the new index position.
require
valid_start: a_position.item <= v.count
is_percent_char: v.code (a_position.item) = 37 -- 37 '%%'
local
c: NATURAL_32
i, n: INTEGER
not_a_digit: BOOLEAN
ascii_pos: NATURAL_32
ival: NATURAL_32
pos: INTEGER
c_is_digit: BOOLEAN
do
--| pos is index in stream of escape character ('%')
pos := a_position.item
c := v.code (pos + 1)
if c = 85 or c = 117 then -- 117 'u' 85 'U'
-- NOTE: this is not a standard, but it can occur, so use this for decoding only
-- An escaped Unicode (ucs2) value, from ECMA scripts
-- has the form: %u<n> where <n> is the UCS value
-- of the character (two byte integer, one to 4 chars
-- after escape sequence).
-- See: http://en.wikipedia.org/wiki/Percent-encoding#Non-standard_implementations
-- UTF-8 result can be 1 to 4 characters.
from
i := pos + 2
n := v.count
until
(i > n) or not_a_digit
loop
c := v.code (i)
c_is_digit := (48 <= c and c <= 57) -- DIGIT: 0 .. 9
if
c_is_digit
or (97 <= c and c <= 102) -- ALPHA: a..f
or (65 <= c and c <= 70) -- ALPHA: A..F
then
ival := ival * 16
if c_is_digit then
ival := ival + (c - 48) -- 48 '0'
else
if c > 70 then -- a..f
ival := ival + (c - 97) + 10 -- 97 'a'
else -- A..F
ival := ival + (c - 65) + 10 -- 65 'A'
end
end
i := i + 1
else
not_a_digit := True
i := i - 1
end
end
a_position.replace (i)
Result := ival
else
-- ASCII char?
ascii_pos := hexadecimal_string_to_natural_32 (v.substring (pos + 1, pos + 2))
Result := ascii_pos
a_position.replace (pos + 2)
end
end
next_percent_decoded_unicode_character_code (v: READABLE_STRING_GENERAL; a_position: CELL [INTEGER]): NATURAL_32
-- Next decoded character from `v' at position `a_position.item'
-- note: it also updates `a_position' to indicate the new index position.
require
valid_start: a_position.item <= v.count
is_percent_char: v.code (a_position.item) = 37 -- 37 '%%'
local
n, j: INTEGER
c: NATURAL_32
c1, c2, c3, c4: NATURAL_32
pr: CELL [INTEGER]
do
create pr.put (a_position.item)
c1 := next_percent_decoded_character_code (v, pr)
j := pr.item
n := v.count
Result := c1
a_position.replace (j)
if c1 <= 0x7F then
-- 0xxxxxxx
Result := c1
elseif c1 <= 0xDF then
-- 110xxxxx 10xxxxxx
if j + 2 <= n then
c := v.code (j + 1)
if c = 37 then -- 37 '%%'
pr.replace (j + 1)
c2 := next_percent_decoded_character_code (v, pr)
j := pr.item
Result := (
((c1 & 0x1F) |<< 6) |
( c2 & 0x3F )
)
a_position.replace (j)
else
-- Do not try to decode
end
end
elseif c1 <= 0xEF then
-- 1110xxxx 10xxxxxx 10xxxxxx
if j + 2 <= n then
c := v.code (j + 1)
if c = 37 then -- 37 '%%'
pr.replace (j + 1)
c2 := next_percent_decoded_character_code (v, pr)
j := pr.item
if j + 2 <= n then
c := v.code (j + 1)
if c = 37 then -- 37 '%%'
pr.replace (j + 1)
c3 := next_percent_decoded_character_code (v, pr)
j := pr.item
Result := (
((c1 & 0xF) |<< 12) |
((c2 & 0x3F) |<< 6) |
( c3 & 0x3F )
)
a_position.replace (j)
else
-- Do not try to decode
end
end
else
-- Do not try to decode
end
end
elseif c1 <= 0xF7 then
-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
if j + 2 <= n then
c := v.code (j + 1)
if c = 37 then -- 37 '%%'
pr.replace (j + 1)
c2 := next_percent_decoded_character_code (v, pr)
j := pr.item
if j + 2 <= n then
c := v.code (j + 1)
if c = 37 then -- 37 '%%'
pr.replace (j + 1)
c3 := next_percent_decoded_character_code (v, pr)
j := pr.item
if j + 2 <= n then
c := v.code (j + 1)
if c = 37 then -- 37 '%%'
pr.replace (j + 1)
c4 := next_percent_decoded_character_code (v, pr)
j := pr.item
a_position.replace (j)
Result := (
((c1 & 0x7) |<< 18 ) |
((c2 & 0x3F) |<< 12) |
((c3 & 0x3F) |<< 6) |
( c4 & 0x3F )
)
else
-- Do not try to decode
end
end
else
-- Do not try to decode
end
end
else
-- Do not try to decode
end
end
else
Result := c1
end
end
feature -- RFC and characters
is_hexa_decimal_character (c: CHARACTER_32): BOOLEAN
-- Is hexadecimal character ?
do
Result := ('a' <= c and c <= 'f') or ('A' <= c and c <= 'F') -- HEXA
or ('0' <= c and c <= '9') -- DIGIT
end
is_alpha_or_digit_character (c: CHARACTER_32): BOOLEAN
-- Is ALPHA or DIGIT character ?
do
Result := ('a' <= c and c <= 'z') or ('A' <= c and c <= 'Z') -- ALPHA
or ('0' <= c and c <= '9') -- DIGIT
end
is_alpha_character (c: CHARACTER_32): BOOLEAN
-- Is ALPHA character ?
do
Result := ('a' <= c and c <= 'z') or ('A' <= c and c <= 'Z')
end
is_digit_character (c: CHARACTER_32): BOOLEAN
-- Is DIGIT character ?
do
Result := ('0' <= c and c <= '9')
end
is_unreserved_character (c: CHARACTER_32): BOOLEAN
-- unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
do
if
('a' <= c and c <= 'z') -- ALPHA
or ('A' <= c and c <= 'Z') -- ALPHA
or ('0' <= c and c <= '9') -- DIGIT
then
Result := True
else
inspect c
when '-', '_', '.', '~' then -- unreserved
Result := True
else
end
end
end
is_reserved_character (c: CHARACTER_32): BOOLEAN
-- reserved = gen-delims / sub-delims
do
Result := is_gen_delims_character (c) or is_sub_delims_character (c)
end
is_gen_delims_character (c: CHARACTER_32): BOOLEAN
-- gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
do
inspect c
when ':' , '/', '?' , '#' , '[' , ']' , '@' then
Result := True
else
end
end
is_sub_delims_character (c: CHARACTER_32): BOOLEAN
-- sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
-- / "*" / "+" / "," / ";" / "="
do
inspect c
when '!' , '$' , '&' , '%'' , '(' , ')' , '*' , '+' , ',' , ';' , '=' then -- sub-delims
Result := True
else
end
end
feature {NONE} -- Implementation
hex_digit: SPECIAL [NATURAL_32]
-- Hexadecimal digits.
once
create Result.make_filled (0, 16)
Result [0] := {NATURAL_32} 48 -- 48 '0'
Result [1] := {NATURAL_32} 49 -- 49 '1'
Result [2] := {NATURAL_32} 50 -- 50 '2'
Result [3] := {NATURAL_32} 51 -- 51 '3'
Result [4] := {NATURAL_32} 52 -- 52 '4'
Result [5] := {NATURAL_32} 53 -- 53 '5'
Result [6] := {NATURAL_32} 54 -- 54 '6'
Result [7] := {NATURAL_32} 55 -- 55 '7'
Result [8] := {NATURAL_32} 56 -- 56 '8'
Result [9] := {NATURAL_32} 57 -- 57 '9'
Result [10] := {NATURAL_32} 65 -- 65 'A'
Result [11] := {NATURAL_32} 66 -- 66 'B'
Result [12] := {NATURAL_32} 67 -- 67 'C'
Result [13] := {NATURAL_32} 68 -- 68 'D'
Result [14] := {NATURAL_32} 69 -- 69 'E'
Result [15] := {NATURAL_32} 70 -- 70 'F'
end
is_hexa_decimal (a_string: READABLE_STRING_GENERAL): BOOLEAN
-- Is `a_string' a valid hexadecimal sequence?
local
l_convertor: like ctoi_convertor
do
l_convertor := ctoi_convertor
l_convertor.parse_string_with_type (a_string, {NUMERIC_INFORMATION}.type_natural_32)
Result := l_convertor.is_integral_integer
end
hexadecimal_string_to_natural_32 (a_hex_string: READABLE_STRING_GENERAL): NATURAL_32
-- Convert hexadecimal value `a_hex_string' to its corresponding NATURAL_32 value.
require
is_hexa: is_hexa_decimal (a_hex_string)
local
l_convertor: like ctoi_convertor
do
l_convertor := ctoi_convertor
l_convertor.parse_string_with_type (a_hex_string, {NUMERIC_INFORMATION}.type_no_limitation)
Result := l_convertor.parsed_natural_32
end
ctoi_convertor: HEXADECIMAL_STRING_TO_INTEGER_CONVERTER
-- Converter used to convert string to integer or natural.
once
create Result.make
Result.set_leading_separators_acceptable (False)
Result.set_trailing_separators_acceptable (False)
ensure
ctoi_convertor_not_void: Result /= Void
end
note
copyright: "2011-2013, Jocelyn Fiat, Javier Velilla, Olivier Ligot, Eiffel Software and others"
license: "Eiffel Forum License v2 (see http://www.eiffel.com/licensing/forum.txt)"
source: "[
Eiffel Software
5949 Hollister Ave., Goleta, CA 93117 USA
Telephone 805-685-1006, Fax 805-685-6869
Website http://www.eiffel.com
Customer support http://support.eiffel.com
]"
end