source: josm/trunk/src/com/google/gdata/util/common/base/PercentEscaper.java@ 4231

Last change on this file since 4231 was 4231, checked in by stoecker, 13 years ago

add signpost and metadata extractor code to repository directly

File size: 9.7 KB
Line 
1/* Copyright (c) 2008 Google Inc.
2 *
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16
17package com.google.gdata.util.common.base;
18
19/**
20 * A {@code UnicodeEscaper} that escapes some set of Java characters using
21 * the URI percent encoding scheme. The set of safe characters (those which
22 * remain unescaped) can be specified on construction.
23 *
24 * <p>For details on escaping URIs for use in web pages, see section 2.4 of
25 * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
26 *
27 * <p>In most cases this class should not need to be used directly. If you
28 * have no special requirements for escaping your URIs, you should use either
29 * {@link CharEscapers#uriEscaper()} or
30 * {@link CharEscapers#uriEscaper(boolean)}.
31 *
32 * <p>When encoding a String, the following rules apply:
33 * <ul>
34 * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
35 * through "9" remain the same.
36 * <li>Any additionally specified safe characters remain the same.
37 * <li>If {@code plusForSpace} was specified, the space character " " is
38 * converted into a plus sign "+".
39 * <li>All other characters are converted into one or more bytes using UTF-8
40 * encoding and each byte is then represented by the 3-character string
41 * "%XY", where "XY" is the two-digit, uppercase, hexadecimal representation
42 * of the byte value.
43 * </ul>
44 *
45 * <p>RFC 2396 specifies the set of unreserved characters as "-", "_", ".", "!",
46 * "~", "*", "'", "(" and ")". It goes on to state:
47 *
48 * <p><i>Unreserved characters can be escaped without changing the semantics
49 * of the URI, but this should not be done unless the URI is being used
50 * in a context that does not allow the unescaped character to appear.</i>
51 *
52 * <p>For performance reasons the only currently supported character encoding of
53 * this class is UTF-8.
54 *
55 * <p><b>Note</b>: This escaper produces uppercase hexidecimal sequences. From
56 * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br>
57 * <i>"URI producers and normalizers should use uppercase hexadecimal digits
58 * for all percent-encodings."</i>
59 *
60 *
61 */
62public class PercentEscaper extends UnicodeEscaper {
63 /**
64 * A string of safe characters that mimics the behavior of
65 * {@link java.net.URLEncoder}.
66 *
67 */
68 public static final String SAFECHARS_URLENCODER = "-_.*";
69
70 /**
71 * A string of characters that do not need to be encoded when used in URI
72 * path segments, as specified in RFC 3986. Note that some of these
73 * characters do need to be escaped when used in other parts of the URI.
74 */
75 public static final String SAFEPATHCHARS_URLENCODER = "-_.!~*'()@:$&,;=";
76
77 /**
78 * A string of characters that do not need to be encoded when used in URI
79 * query strings, as specified in RFC 3986. Note that some of these
80 * characters do need to be escaped when used in other parts of the URI.
81 */
82 public static final String SAFEQUERYSTRINGCHARS_URLENCODER
83 = "-_.!~*'()@:$,;/?:";
84
85 // In some uri escapers spaces are escaped to '+'
86 private static final char[] URI_ESCAPED_SPACE = { '+' };
87
88 private static final char[] UPPER_HEX_DIGITS =
89 "0123456789ABCDEF".toCharArray();
90
91 /**
92 * If true we should convert space to the {@code +} character.
93 */
94 private final boolean plusForSpace;
95
96 /**
97 * An array of flags where for any {@code char c} if {@code safeOctets[c]} is
98 * true then {@code c} should remain unmodified in the output. If
99 * {@code c > safeOctets.length} then it should be escaped.
100 */
101 private final boolean[] safeOctets;
102
103 /**
104 * Constructs a URI escaper with the specified safe characters and optional
105 * handling of the space character.
106 *
107 * @param safeChars a non null string specifying additional safe characters
108 * for this escaper (the ranges 0..9, a..z and A..Z are always safe and
109 * should not be specified here)
110 * @param plusForSpace true if ASCII space should be escaped to {@code +}
111 * rather than {@code %20}
112 * @throws IllegalArgumentException if any of the parameters were invalid
113 */
114 public PercentEscaper(String safeChars, boolean plusForSpace) {
115 // Avoid any misunderstandings about the behavior of this escaper
116 if (safeChars.matches(".*[0-9A-Za-z].*")) {
117 throw new IllegalArgumentException(
118 "Alphanumeric characters are always 'safe' and should not be " +
119 "explicitly specified");
120 }
121 // Avoid ambiguous parameters. Safe characters are never modified so if
122 // space is a safe character then setting plusForSpace is meaningless.
123 if (plusForSpace && safeChars.contains(" ")) {
124 throw new IllegalArgumentException(
125 "plusForSpace cannot be specified when space is a 'safe' character");
126 }
127 if (safeChars.contains("%")) {
128 throw new IllegalArgumentException(
129 "The '%' character cannot be specified as 'safe'");
130 }
131 this.plusForSpace = plusForSpace;
132 this.safeOctets = createSafeOctets(safeChars);
133 }
134
135 /**
136 * Creates a boolean[] with entries corresponding to the character values
137 * for 0-9, A-Z, a-z and those specified in safeChars set to true. The array
138 * is as small as is required to hold the given character information.
139 */
140 private static boolean[] createSafeOctets(String safeChars) {
141 int maxChar = 'z';
142 char[] safeCharArray = safeChars.toCharArray();
143 for (char c : safeCharArray) {
144 maxChar = Math.max(c, maxChar);
145 }
146 boolean[] octets = new boolean[maxChar + 1];
147 for (int c = '0'; c <= '9'; c++) {
148 octets[c] = true;
149 }
150 for (int c = 'A'; c <= 'Z'; c++) {
151 octets[c] = true;
152 }
153 for (int c = 'a'; c <= 'z'; c++) {
154 octets[c] = true;
155 }
156 for (char c : safeCharArray) {
157 octets[c] = true;
158 }
159 return octets;
160 }
161
162 /*
163 * Overridden for performance. For unescaped strings this improved the
164 * performance of the uri escaper from ~760ns to ~400ns as measured by
165 * {@link CharEscapersBenchmark}.
166 */
167 @Override
168 protected int nextEscapeIndex(CharSequence csq, int index, int end) {
169 for (; index < end; index++) {
170 char c = csq.charAt(index);
171 if (c >= safeOctets.length || !safeOctets[c]) {
172 break;
173 }
174 }
175 return index;
176 }
177
178 /*
179 * Overridden for performance. For unescaped strings this improved the
180 * performance of the uri escaper from ~400ns to ~170ns as measured by
181 * {@link CharEscapersBenchmark}.
182 */
183 @Override
184 public String escape(String s) {
185 int slen = s.length();
186 for (int index = 0; index < slen; index++) {
187 char c = s.charAt(index);
188 if (c >= safeOctets.length || !safeOctets[c]) {
189 return escapeSlow(s, index);
190 }
191 }
192 return s;
193 }
194
195 /**
196 * Escapes the given Unicode code point in UTF-8.
197 */
198 @Override
199 protected char[] escape(int cp) {
200 // We should never get negative values here but if we do it will throw an
201 // IndexOutOfBoundsException, so at least it will get spotted.
202 if (cp < safeOctets.length && safeOctets[cp]) {
203 return null;
204 } else if (cp == ' ' && plusForSpace) {
205 return URI_ESCAPED_SPACE;
206 } else if (cp <= 0x7F) {
207 // Single byte UTF-8 characters
208 // Start with "%--" and fill in the blanks
209 char[] dest = new char[3];
210 dest[0] = '%';
211 dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
212 dest[1] = UPPER_HEX_DIGITS[cp >>> 4];
213 return dest;
214 } else if (cp <= 0x7ff) {
215 // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff]
216 // Start with "%--%--" and fill in the blanks
217 char[] dest = new char[6];
218 dest[0] = '%';
219 dest[3] = '%';
220 dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
221 cp >>>= 4;
222 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
223 cp >>>= 2;
224 dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
225 cp >>>= 4;
226 dest[1] = UPPER_HEX_DIGITS[0xC | cp];
227 return dest;
228 } else if (cp <= 0xffff) {
229 // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff]
230 // Start with "%E-%--%--" and fill in the blanks
231 char[] dest = new char[9];
232 dest[0] = '%';
233 dest[1] = 'E';
234 dest[3] = '%';
235 dest[6] = '%';
236 dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
237 cp >>>= 4;
238 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
239 cp >>>= 2;
240 dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
241 cp >>>= 4;
242 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
243 cp >>>= 2;
244 dest[2] = UPPER_HEX_DIGITS[cp];
245 return dest;
246 } else if (cp <= 0x10ffff) {
247 char[] dest = new char[12];
248 // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff]
249 // Start with "%F-%--%--%--" and fill in the blanks
250 dest[0] = '%';
251 dest[1] = 'F';
252 dest[3] = '%';
253 dest[6] = '%';
254 dest[9] = '%';
255 dest[11] = UPPER_HEX_DIGITS[cp & 0xF];
256 cp >>>= 4;
257 dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
258 cp >>>= 2;
259 dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
260 cp >>>= 4;
261 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
262 cp >>>= 2;
263 dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
264 cp >>>= 4;
265 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
266 cp >>>= 2;
267 dest[2] = UPPER_HEX_DIGITS[cp & 0x7];
268 return dest;
269 } else {
270 // If this ever happens it is due to bug in UnicodeEscaper, not bad input.
271 throw new IllegalArgumentException(
272 "Invalid unicode character value " + cp);
273 }
274 }
275}
Note: See TracBrowser for help on using the repository browser.