source: josm/trunk/src/com/google/gdata/util/common/base/UnicodeEscaper.java@ 9514

Last change on this file since 9514 was 4231, checked in by stoecker, 13 years ago

add signpost and metadata extractor code to repository directly

File size: 19.1 KB
Line 
1/* Copyright (c) 2008 Google Inc.
2 *
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16
17package com.google.gdata.util.common.base;
18
19import static com.google.gdata.util.common.base.Preconditions.checkNotNull;
20
21import java.io.IOException;
22
23/**
24 * An {@link Escaper} that converts literal text into a format safe for
25 * inclusion in a particular context (such as an XML document). Typically (but
26 * not always), the inverse process of "unescaping" the text is performed
27 * automatically by the relevant parser.
28 *
29 * <p>For example, an XML escaper would convert the literal string {@code
30 * "Foo<Bar>"} into {@code "Foo&lt;Bar&gt;"} to prevent {@code "<Bar>"} from
31 * being confused with an XML tag. When the resulting XML document is parsed,
32 * the parser API will return this text as the original literal string {@code
33 * "Foo<Bar>"}.
34 *
35 * <p><b>Note:</b> This class is similar to {@link CharEscaper} but with one
36 * very important difference. A CharEscaper can only process Java
37 * <a href="http://en.wikipedia.org/wiki/UTF-16">UTF16</a> characters in
38 * isolation and may not cope when it encounters surrogate pairs. This class
39 * facilitates the correct escaping of all Unicode characters.
40 *
41 * <p>As there are important reasons, including potential security issues, to
42 * handle Unicode correctly if you are considering implementing a new escaper
43 * you should favor using UnicodeEscaper wherever possible.
44 *
45 * <p>A {@code UnicodeEscaper} instance is required to be stateless, and safe
46 * when used concurrently by multiple threads.
47 *
48 * <p>Several popular escapers are defined as constants in the class {@link
49 * CharEscapers}. To create your own escapers extend this class and implement
50 * the {@link #escape(int)} method.
51 *
52 *
53 */
54public abstract class UnicodeEscaper implements Escaper {
55 /** The amount of padding (chars) to use when growing the escape buffer. */
56 private static final int DEST_PAD = 32;
57
58 /**
59 * Returns the escaped form of the given Unicode code point, or {@code null}
60 * if this code point does not need to be escaped. When called as part of an
61 * escaping operation, the given code point is guaranteed to be in the range
62 * {@code 0 <= cp <= Character#MAX_CODE_POINT}.
63 *
64 * <p>If an empty array is returned, this effectively strips the input
65 * character from the resulting text.
66 *
67 * <p>If the character does not need to be escaped, this method should return
68 * {@code null}, rather than an array containing the character representation
69 * of the code point. This enables the escaping algorithm to perform more
70 * efficiently.
71 *
72 * <p>If the implementation of this method cannot correctly handle a
73 * particular code point then it should either throw an appropriate runtime
74 * exception or return a suitable replacement character. It must never
75 * silently discard invalid input as this may constitute a security risk.
76 *
77 * @param cp the Unicode code point to escape if necessary
78 * @return the replacement characters, or {@code null} if no escaping was
79 * needed
80 */
81 protected abstract char[] escape(int cp);
82
83 /**
84 * Scans a sub-sequence of characters from a given {@link CharSequence},
85 * returning the index of the next character that requires escaping.
86 *
87 * <p><b>Note:</b> When implementing an escaper, it is a good idea to override
88 * this method for efficiency. The base class implementation determines
89 * successive Unicode code points and invokes {@link #escape(int)} for each of
90 * them. If the semantics of your escaper are such that code points in the
91 * supplementary range are either all escaped or all unescaped, this method
92 * can be implemented more efficiently using {@link CharSequence#charAt(int)}.
93 *
94 * <p>Note however that if your escaper does not escape characters in the
95 * supplementary range, you should either continue to validate the correctness
96 * of any surrogate characters encountered or provide a clear warning to users
97 * that your escaper does not validate its input.
98 *
99 * <p>See {@link PercentEscaper} for an example.
100 *
101 * @param csq a sequence of characters
102 * @param start the index of the first character to be scanned
103 * @param end the index immediately after the last character to be scanned
104 * @throws IllegalArgumentException if the scanned sub-sequence of {@code csq}
105 * contains invalid surrogate pairs
106 */
107 protected int nextEscapeIndex(CharSequence csq, int start, int end) {
108 int index = start;
109 while (index < end) {
110 int cp = codePointAt(csq, index, end);
111 if (cp < 0 || escape(cp) != null) {
112 break;
113 }
114 index += Character.isSupplementaryCodePoint(cp) ? 2 : 1;
115 }
116 return index;
117 }
118
119 /**
120 * Returns the escaped form of a given literal string.
121 *
122 * <p>If you are escaping input in arbitrary successive chunks, then it is not
123 * generally safe to use this method. If an input string ends with an
124 * unmatched high surrogate character, then this method will throw
125 * {@link IllegalArgumentException}. You should either ensure your input is
126 * valid <a href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a> before
127 * calling this method or use an escaped {@link Appendable} (as returned by
128 * {@link #escape(Appendable)}) which can cope with arbitrarily split input.
129 *
130 * <p><b>Note:</b> When implementing an escaper it is a good idea to override
131 * this method for efficiency by inlining the implementation of
132 * {@link #nextEscapeIndex(CharSequence, int, int)} directly. Doing this for
133 * {@link PercentEscaper} more than doubled the performance for unescaped
134 * strings (as measured by {@link CharEscapersBenchmark}).
135 *
136 * @param string the literal string to be escaped
137 * @return the escaped form of {@code string}
138 * @throws NullPointerException if {@code string} is null
139 * @throws IllegalArgumentException if invalid surrogate characters are
140 * encountered
141 */
142 public String escape(String string) {
143 int end = string.length();
144 int index = nextEscapeIndex(string, 0, end);
145 return index == end ? string : escapeSlow(string, index);
146 }
147
148 /**
149 * Returns the escaped form of a given literal string, starting at the given
150 * index. This method is called by the {@link #escape(String)} method when it
151 * discovers that escaping is required. It is protected to allow subclasses
152 * to override the fastpath escaping function to inline their escaping test.
153 * See {@link CharEscaperBuilder} for an example usage.
154 *
155 * <p>This method is not reentrant and may only be invoked by the top level
156 * {@link #escape(String)} method.
157 *
158 * @param s the literal string to be escaped
159 * @param index the index to start escaping from
160 * @return the escaped form of {@code string}
161 * @throws NullPointerException if {@code string} is null
162 * @throws IllegalArgumentException if invalid surrogate characters are
163 * encountered
164 */
165 protected final String escapeSlow(String s, int index) {
166 int end = s.length();
167
168 // Get a destination buffer and setup some loop variables.
169 char[] dest = DEST_TL.get();
170 int destIndex = 0;
171 int unescapedChunkStart = 0;
172
173 while (index < end) {
174 int cp = codePointAt(s, index, end);
175 if (cp < 0) {
176 throw new IllegalArgumentException(
177 "Trailing high surrogate at end of input");
178 }
179 char[] escaped = escape(cp);
180 if (escaped != null) {
181 int charsSkipped = index - unescapedChunkStart;
182
183 // This is the size needed to add the replacement, not the full
184 // size needed by the string. We only regrow when we absolutely must.
185 int sizeNeeded = destIndex + charsSkipped + escaped.length;
186 if (dest.length < sizeNeeded) {
187 int destLength = sizeNeeded + (end - index) + DEST_PAD;
188 dest = growBuffer(dest, destIndex, destLength);
189 }
190 // If we have skipped any characters, we need to copy them now.
191 if (charsSkipped > 0) {
192 s.getChars(unescapedChunkStart, index, dest, destIndex);
193 destIndex += charsSkipped;
194 }
195 if (escaped.length > 0) {
196 System.arraycopy(escaped, 0, dest, destIndex, escaped.length);
197 destIndex += escaped.length;
198 }
199 }
200 unescapedChunkStart
201 = index + (Character.isSupplementaryCodePoint(cp) ? 2 : 1);
202 index = nextEscapeIndex(s, unescapedChunkStart, end);
203 }
204
205 // Process trailing unescaped characters - no need to account for escaped
206 // length or padding the allocation.
207 int charsSkipped = end - unescapedChunkStart;
208 if (charsSkipped > 0) {
209 int endIndex = destIndex + charsSkipped;
210 if (dest.length < endIndex) {
211 dest = growBuffer(dest, destIndex, endIndex);
212 }
213 s.getChars(unescapedChunkStart, end, dest, destIndex);
214 destIndex = endIndex;
215 }
216 return new String(dest, 0, destIndex);
217 }
218
219 /**
220 * Returns an {@code Appendable} instance which automatically escapes all
221 * text appended to it before passing the resulting text to an underlying
222 * {@code Appendable}.
223 *
224 * <p>Unlike {@link #escape(String)} it is permitted to append arbitrarily
225 * split input to this Appendable, including input that is split over a
226 * surrogate pair. In this case the pending high surrogate character will not
227 * be processed until the corresponding low surrogate is appended. This means
228 * that a trailing high surrogate character at the end of the input cannot be
229 * detected and will be silently ignored. This is unavoidable since the
230 * Appendable interface has no {@code close()} method, and it is impossible to
231 * determine when the last characters have been appended.
232 *
233 * <p>The methods of the returned object will propagate any exceptions thrown
234 * by the underlying {@code Appendable}.
235 *
236 * <p>For well formed <a href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a>
237 * the escaping behavior is identical to that of {@link #escape(String)} and
238 * the following code is equivalent to (but much slower than)
239 * {@code escaper.escape(string)}: <pre>{@code
240 *
241 * StringBuilder sb = new StringBuilder();
242 * escaper.escape(sb).append(string);
243 * return sb.toString();}</pre>
244 *
245 * @param out the underlying {@code Appendable} to append escaped output to
246 * @return an {@code Appendable} which passes text to {@code out} after
247 * escaping it
248 * @throws NullPointerException if {@code out} is null
249 * @throws IllegalArgumentException if invalid surrogate characters are
250 * encountered
251 *
252 */
253 public Appendable escape(final Appendable out) {
254 checkNotNull(out);
255
256 return new Appendable() {
257 int pendingHighSurrogate = -1;
258 char[] decodedChars = new char[2];
259
260 public Appendable append(CharSequence csq) throws IOException {
261 return append(csq, 0, csq.length());
262 }
263
264 public Appendable append(CharSequence csq, int start, int end)
265 throws IOException {
266 int index = start;
267 if (index < end) {
268 // This is a little subtle: index must never reference the middle of a
269 // surrogate pair but unescapedChunkStart can. The first time we enter
270 // the loop below it is possible that index != unescapedChunkStart.
271 int unescapedChunkStart = index;
272 if (pendingHighSurrogate != -1) {
273 // Our last append operation ended halfway through a surrogate pair
274 // so we have to do some extra work first.
275 char c = csq.charAt(index++);
276 if (!Character.isLowSurrogate(c)) {
277 throw new IllegalArgumentException(
278 "Expected low surrogate character but got " + c);
279 }
280 char[] escaped =
281 escape(Character.toCodePoint((char) pendingHighSurrogate, c));
282 if (escaped != null) {
283 // Emit the escaped character and adjust unescapedChunkStart to
284 // skip the low surrogate we have consumed.
285 outputChars(escaped, escaped.length);
286 unescapedChunkStart += 1;
287 } else {
288 // Emit pending high surrogate (unescaped) but do not modify
289 // unescapedChunkStart as we must still emit the low surrogate.
290 out.append((char) pendingHighSurrogate);
291 }
292 pendingHighSurrogate = -1;
293 }
294 while (true) {
295 // Find and append the next subsequence of unescaped characters.
296 index = nextEscapeIndex(csq, index, end);
297 if (index > unescapedChunkStart) {
298 out.append(csq, unescapedChunkStart, index);
299 }
300 if (index == end) {
301 break;
302 }
303 // If we are not finished, calculate the next code point.
304 int cp = codePointAt(csq, index, end);
305 if (cp < 0) {
306 // Our sequence ended half way through a surrogate pair so just
307 // record the state and exit.
308 pendingHighSurrogate = -cp;
309 break;
310 }
311 // Escape the code point and output the characters.
312 char[] escaped = escape(cp);
313 if (escaped != null) {
314 outputChars(escaped, escaped.length);
315 } else {
316 // This shouldn't really happen if nextEscapeIndex is correct but
317 // we should cope with false positives.
318 int len = Character.toChars(cp, decodedChars, 0);
319 outputChars(decodedChars, len);
320 }
321 // Update our index past the escaped character and continue.
322 index += (Character.isSupplementaryCodePoint(cp) ? 2 : 1);
323 unescapedChunkStart = index;
324 }
325 }
326 return this;
327 }
328
329 public Appendable append(char c) throws IOException {
330 if (pendingHighSurrogate != -1) {
331 // Our last append operation ended halfway through a surrogate pair
332 // so we have to do some extra work first.
333 if (!Character.isLowSurrogate(c)) {
334 throw new IllegalArgumentException(
335 "Expected low surrogate character but got '" + c +
336 "' with value " + (int) c);
337 }
338 char[] escaped =
339 escape(Character.toCodePoint((char) pendingHighSurrogate, c));
340 if (escaped != null) {
341 outputChars(escaped, escaped.length);
342 } else {
343 out.append((char) pendingHighSurrogate);
344 out.append(c);
345 }
346 pendingHighSurrogate = -1;
347 } else if (Character.isHighSurrogate(c)) {
348 // This is the start of a (split) surrogate pair.
349 pendingHighSurrogate = c;
350 } else {
351 if (Character.isLowSurrogate(c)) {
352 throw new IllegalArgumentException(
353 "Unexpected low surrogate character '" + c +
354 "' with value " + (int) c);
355 }
356 // This is a normal (non surrogate) char.
357 char[] escaped = escape(c);
358 if (escaped != null) {
359 outputChars(escaped, escaped.length);
360 } else {
361 out.append(c);
362 }
363 }
364 return this;
365 }
366
367 private void outputChars(char[] chars, int len) throws IOException {
368 for (int n = 0; n < len; n++) {
369 out.append(chars[n]);
370 }
371 }
372 };
373 }
374
375 /**
376 * Returns the Unicode code point of the character at the given index.
377 *
378 * <p>Unlike {@link Character#codePointAt(CharSequence, int)} or
379 * {@link String#codePointAt(int)} this method will never fail silently when
380 * encountering an invalid surrogate pair.
381 *
382 * <p>The behaviour of this method is as follows:
383 * <ol>
384 * <li>If {@code index >= end}, {@link IndexOutOfBoundsException} is thrown.
385 * <li><b>If the character at the specified index is not a surrogate, it is
386 * returned.</b>
387 * <li>If the first character was a high surrogate value, then an attempt is
388 * made to read the next character.
389 * <ol>
390 * <li><b>If the end of the sequence was reached, the negated value of
391 * the trailing high surrogate is returned.</b>
392 * <li><b>If the next character was a valid low surrogate, the code point
393 * value of the high/low surrogate pair is returned.</b>
394 * <li>If the next character was not a low surrogate value, then
395 * {@link IllegalArgumentException} is thrown.
396 * </ol>
397 * <li>If the first character was a low surrogate value,
398 * {@link IllegalArgumentException} is thrown.
399 * </ol>
400 *
401 * @param seq the sequence of characters from which to decode the code point
402 * @param index the index of the first character to decode
403 * @param end the index beyond the last valid character to decode
404 * @return the Unicode code point for the given index or the negated value of
405 * the trailing high surrogate character at the end of the sequence
406 */
407 protected static final int codePointAt(CharSequence seq, int index, int end) {
408 if (index < end) {
409 char c1 = seq.charAt(index++);
410 if (c1 < Character.MIN_HIGH_SURROGATE ||
411 c1 > Character.MAX_LOW_SURROGATE) {
412 // Fast path (first test is probably all we need to do)
413 return c1;
414 } else if (c1 <= Character.MAX_HIGH_SURROGATE) {
415 // If the high surrogate was the last character, return its inverse
416 if (index == end) {
417 return -c1;
418 }
419 // Otherwise look for the low surrogate following it
420 char c2 = seq.charAt(index);
421 if (Character.isLowSurrogate(c2)) {
422 return Character.toCodePoint(c1, c2);
423 }
424 throw new IllegalArgumentException(
425 "Expected low surrogate but got char '" + c2 +
426 "' with value " + (int) c2 + " at index " + index);
427 } else {
428 throw new IllegalArgumentException(
429 "Unexpected low surrogate character '" + c1 +
430 "' with value " + (int) c1 + " at index " + (index - 1));
431 }
432 }
433 throw new IndexOutOfBoundsException("Index exceeds specified range");
434 }
435
436 /**
437 * Helper method to grow the character buffer as needed, this only happens
438 * once in a while so it's ok if it's in a method call. If the index passed
439 * in is 0 then no copying will be done.
440 */
441 private static final char[] growBuffer(char[] dest, int index, int size) {
442 char[] copy = new char[size];
443 if (index > 0) {
444 System.arraycopy(dest, 0, copy, 0, index);
445 }
446 return copy;
447 }
448
449 /**
450 * A thread-local destination buffer to keep us from creating new buffers.
451 * The starting size is 1024 characters. If we grow past this we don't
452 * put it back in the threadlocal, we just keep going and grow as needed.
453 */
454 private static final ThreadLocal<char[]> DEST_TL = new ThreadLocal<char[]>() {
455 @Override
456 protected char[] initialValue() {
457 return new char[1024];
458 }
459 };
460}
Note: See TracBrowser for help on using the repository browser.