Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: josm/trunk/src/com/google/gdata/util/common/base/UnicodeEscaper.java@ 9514

Last change on this file since 9514 was 4231, checked in by stoecker, 14 years ago
add signpost and metadata extractor code to repository directly
File size: 19.1 KB

Line
1	/* Copyright (c) 2008 Google Inc.
2	*
3	* Licensed under the Apache License, Version 2.0 (the "License");
4	* you may not use this file except in compliance with the License.
5	* You may obtain a copy of the License at
6	*
7	* http://www.apache.org/licenses/LICENSE-2.0
8	*
9	* Unless required by applicable law or agreed to in writing, software
10	* distributed under the License is distributed on an "AS IS" BASIS,
11	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	* See the License for the specific language governing permissions and
13	* limitations under the License.
14	*/
15
16
17	package com.google.gdata.util.common.base;
18
19	import static com.google.gdata.util.common.base.Preconditions.checkNotNull;
20
21	import java.io.IOException;
22
23	/**
24	* An {@link Escaper} that converts literal text into a format safe for
25	* inclusion in a particular context (such as an XML document). Typically (but
26	* not always), the inverse process of "unescaping" the text is performed
27	* automatically by the relevant parser.
28	*
29	* <p>For example, an XML escaper would convert the literal string {@code
30	* "Foo<Bar>"} into {@code "Foo<Bar>"} to prevent {@code "<Bar>"} from
31	* being confused with an XML tag. When the resulting XML document is parsed,
32	* the parser API will return this text as the original literal string {@code
33	* "Foo<Bar>"}.
34	*
35	* <p><b>Note:</b> This class is similar to {@link CharEscaper} but with one
36	* very important difference. A CharEscaper can only process Java
37	* <a href="http://en.wikipedia.org/wiki/UTF-16">UTF16</a> characters in
38	* isolation and may not cope when it encounters surrogate pairs. This class
39	* facilitates the correct escaping of all Unicode characters.
40	*
41	* <p>As there are important reasons, including potential security issues, to
42	* handle Unicode correctly if you are considering implementing a new escaper
43	* you should favor using UnicodeEscaper wherever possible.
44	*
45	* <p>A {@code UnicodeEscaper} instance is required to be stateless, and safe
46	* when used concurrently by multiple threads.
47	*
48	* <p>Several popular escapers are defined as constants in the class {@link
49	* CharEscapers}. To create your own escapers extend this class and implement
50	* the {@link #escape(int)} method.
51	*
52	*
53	*/
54	public abstract class UnicodeEscaper implements Escaper {
55	/** The amount of padding (chars) to use when growing the escape buffer. */
56	private static final int DEST_PAD = 32;
57
58	/**
59	* Returns the escaped form of the given Unicode code point, or {@code null}
60	* if this code point does not need to be escaped. When called as part of an
61	* escaping operation, the given code point is guaranteed to be in the range
62	* {@code 0 <= cp <= Character#MAX_CODE_POINT}.
63	*
64	* <p>If an empty array is returned, this effectively strips the input
65	* character from the resulting text.
66	*
67	* <p>If the character does not need to be escaped, this method should return
68	* {@code null}, rather than an array containing the character representation
69	* of the code point. This enables the escaping algorithm to perform more
70	* efficiently.
71	*
72	* <p>If the implementation of this method cannot correctly handle a
73	* particular code point then it should either throw an appropriate runtime
74	* exception or return a suitable replacement character. It must never
75	* silently discard invalid input as this may constitute a security risk.
76	*
77	* @param cp the Unicode code point to escape if necessary
78	* @return the replacement characters, or {@code null} if no escaping was
79	* needed
80	*/
81	protected abstract char[] escape(int cp);
82
83	/**
84	* Scans a sub-sequence of characters from a given {@link CharSequence},
85	* returning the index of the next character that requires escaping.
86	*
87	* <p><b>Note:</b> When implementing an escaper, it is a good idea to override
88	* this method for efficiency. The base class implementation determines
89	* successive Unicode code points and invokes {@link #escape(int)} for each of
90	* them. If the semantics of your escaper are such that code points in the
91	* supplementary range are either all escaped or all unescaped, this method
92	* can be implemented more efficiently using {@link CharSequence#charAt(int)}.
93	*
94	* <p>Note however that if your escaper does not escape characters in the
95	* supplementary range, you should either continue to validate the correctness
96	* of any surrogate characters encountered or provide a clear warning to users
97	* that your escaper does not validate its input.
98	*
99	* <p>See {@link PercentEscaper} for an example.
100	*
101	* @param csq a sequence of characters
102	* @param start the index of the first character to be scanned
103	* @param end the index immediately after the last character to be scanned
104	* @throws IllegalArgumentException if the scanned sub-sequence of {@code csq}
105	* contains invalid surrogate pairs
106	*/
107	protected int nextEscapeIndex(CharSequence csq, int start, int end) {
108	int index = start;
109	while (index < end) {
110	int cp = codePointAt(csq, index, end);
111	if (cp < 0 \|\| escape(cp) != null) {
112	break;
113	}
114	index += Character.isSupplementaryCodePoint(cp) ? 2 : 1;
115	}
116	return index;
117	}
118
119	/**
120	* Returns the escaped form of a given literal string.
121	*
122	* <p>If you are escaping input in arbitrary successive chunks, then it is not
123	* generally safe to use this method. If an input string ends with an
124	* unmatched high surrogate character, then this method will throw
125	* {@link IllegalArgumentException}. You should either ensure your input is
126	* valid <a href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a> before
127	* calling this method or use an escaped {@link Appendable} (as returned by
128	* {@link #escape(Appendable)}) which can cope with arbitrarily split input.
129	*
130	* <p><b>Note:</b> When implementing an escaper it is a good idea to override
131	* this method for efficiency by inlining the implementation of
132	* {@link #nextEscapeIndex(CharSequence, int, int)} directly. Doing this for
133	* {@link PercentEscaper} more than doubled the performance for unescaped
134	* strings (as measured by {@link CharEscapersBenchmark}).
135	*
136	* @param string the literal string to be escaped
137	* @return the escaped form of {@code string}
138	* @throws NullPointerException if {@code string} is null
139	* @throws IllegalArgumentException if invalid surrogate characters are
140	* encountered
141	*/
142	public String escape(String string) {
143	int end = string.length();
144	int index = nextEscapeIndex(string, 0, end);
145	return index == end ? string : escapeSlow(string, index);
146	}
147
148	/**
149	* Returns the escaped form of a given literal string, starting at the given
150	* index. This method is called by the {@link #escape(String)} method when it
151	* discovers that escaping is required. It is protected to allow subclasses
152	* to override the fastpath escaping function to inline their escaping test.
153	* See {@link CharEscaperBuilder} for an example usage.
154	*
155	* <p>This method is not reentrant and may only be invoked by the top level
156	* {@link #escape(String)} method.
157	*
158	* @param s the literal string to be escaped
159	* @param index the index to start escaping from
160	* @return the escaped form of {@code string}
161	* @throws NullPointerException if {@code string} is null
162	* @throws IllegalArgumentException if invalid surrogate characters are
163	* encountered
164	*/
165	protected final String escapeSlow(String s, int index) {
166	int end = s.length();
167
168	// Get a destination buffer and setup some loop variables.
169	char[] dest = DEST_TL.get();
170	int destIndex = 0;
171	int unescapedChunkStart = 0;
172
173	while (index < end) {
174	int cp = codePointAt(s, index, end);
175	if (cp < 0) {
176	throw new IllegalArgumentException(
177	"Trailing high surrogate at end of input");
178	}
179	char[] escaped = escape(cp);
180	if (escaped != null) {
181	int charsSkipped = index - unescapedChunkStart;
182
183	// This is the size needed to add the replacement, not the full
184	// size needed by the string. We only regrow when we absolutely must.
185	int sizeNeeded = destIndex + charsSkipped + escaped.length;
186	if (dest.length < sizeNeeded) {
187	int destLength = sizeNeeded + (end - index) + DEST_PAD;
188	dest = growBuffer(dest, destIndex, destLength);
189	}
190	// If we have skipped any characters, we need to copy them now.
191	if (charsSkipped > 0) {
192	s.getChars(unescapedChunkStart, index, dest, destIndex);
193	destIndex += charsSkipped;
194	}
195	if (escaped.length > 0) {
196	System.arraycopy(escaped, 0, dest, destIndex, escaped.length);
197	destIndex += escaped.length;
198	}
199	}
200	unescapedChunkStart
201	= index + (Character.isSupplementaryCodePoint(cp) ? 2 : 1);
202	index = nextEscapeIndex(s, unescapedChunkStart, end);
203	}
204
205	// Process trailing unescaped characters - no need to account for escaped
206	// length or padding the allocation.
207	int charsSkipped = end - unescapedChunkStart;
208	if (charsSkipped > 0) {
209	int endIndex = destIndex + charsSkipped;
210	if (dest.length < endIndex) {
211	dest = growBuffer(dest, destIndex, endIndex);
212	}
213	s.getChars(unescapedChunkStart, end, dest, destIndex);
214	destIndex = endIndex;
215	}
216	return new String(dest, 0, destIndex);
217	}
218
219	/**
220	* Returns an {@code Appendable} instance which automatically escapes all
221	* text appended to it before passing the resulting text to an underlying
222	* {@code Appendable}.
223	*
224	* <p>Unlike {@link #escape(String)} it is permitted to append arbitrarily
225	* split input to this Appendable, including input that is split over a
226	* surrogate pair. In this case the pending high surrogate character will not
227	* be processed until the corresponding low surrogate is appended. This means
228	* that a trailing high surrogate character at the end of the input cannot be
229	* detected and will be silently ignored. This is unavoidable since the
230	* Appendable interface has no {@code close()} method, and it is impossible to
231	* determine when the last characters have been appended.
232	*
233	* <p>The methods of the returned object will propagate any exceptions thrown
234	* by the underlying {@code Appendable}.
235	*
236	* <p>For well formed <a href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a>
237	* the escaping behavior is identical to that of {@link #escape(String)} and
238	* the following code is equivalent to (but much slower than)
239	* {@code escaper.escape(string)}: <pre>{@code
240	*
241	* StringBuilder sb = new StringBuilder();
242	* escaper.escape(sb).append(string);
243	* return sb.toString();}</pre>
244	*
245	* @param out the underlying {@code Appendable} to append escaped output to
246	* @return an {@code Appendable} which passes text to {@code out} after
247	* escaping it
248	* @throws NullPointerException if {@code out} is null
249	* @throws IllegalArgumentException if invalid surrogate characters are
250	* encountered
251	*
252	*/
253	public Appendable escape(final Appendable out) {
254	checkNotNull(out);
255
256	return new Appendable() {
257	int pendingHighSurrogate = -1;
258	char[] decodedChars = new char[2];
259
260	public Appendable append(CharSequence csq) throws IOException {
261	return append(csq, 0, csq.length());
262	}
263
264	public Appendable append(CharSequence csq, int start, int end)
265	throws IOException {
266	int index = start;
267	if (index < end) {
268	// This is a little subtle: index must never reference the middle of a
269	// surrogate pair but unescapedChunkStart can. The first time we enter
270	// the loop below it is possible that index != unescapedChunkStart.
271	int unescapedChunkStart = index;
272	if (pendingHighSurrogate != -1) {
273	// Our last append operation ended halfway through a surrogate pair
274	// so we have to do some extra work first.
275	char c = csq.charAt(index++);
276	if (!Character.isLowSurrogate(c)) {
277	throw new IllegalArgumentException(
278	"Expected low surrogate character but got " + c);
279	}
280	char[] escaped =
281	escape(Character.toCodePoint((char) pendingHighSurrogate, c));
282	if (escaped != null) {
283	// Emit the escaped character and adjust unescapedChunkStart to
284	// skip the low surrogate we have consumed.
285	outputChars(escaped, escaped.length);
286	unescapedChunkStart += 1;
287	} else {
288	// Emit pending high surrogate (unescaped) but do not modify
289	// unescapedChunkStart as we must still emit the low surrogate.
290	out.append((char) pendingHighSurrogate);
291	}
292	pendingHighSurrogate = -1;
293	}
294	while (true) {
295	// Find and append the next subsequence of unescaped characters.
296	index = nextEscapeIndex(csq, index, end);
297	if (index > unescapedChunkStart) {
298	out.append(csq, unescapedChunkStart, index);
299	}
300	if (index == end) {
301	break;
302	}
303	// If we are not finished, calculate the next code point.
304	int cp = codePointAt(csq, index, end);
305	if (cp < 0) {
306	// Our sequence ended half way through a surrogate pair so just
307	// record the state and exit.
308	pendingHighSurrogate = -cp;
309	break;
310	}
311	// Escape the code point and output the characters.
312	char[] escaped = escape(cp);
313	if (escaped != null) {
314	outputChars(escaped, escaped.length);
315	} else {
316	// This shouldn't really happen if nextEscapeIndex is correct but
317	// we should cope with false positives.
318	int len = Character.toChars(cp, decodedChars, 0);
319	outputChars(decodedChars, len);
320	}
321	// Update our index past the escaped character and continue.
322	index += (Character.isSupplementaryCodePoint(cp) ? 2 : 1);
323	unescapedChunkStart = index;
324	}
325	}
326	return this;
327	}
328
329	public Appendable append(char c) throws IOException {
330	if (pendingHighSurrogate != -1) {
331	// Our last append operation ended halfway through a surrogate pair
332	// so we have to do some extra work first.
333	if (!Character.isLowSurrogate(c)) {
334	throw new IllegalArgumentException(
335	"Expected low surrogate character but got '" + c +
336	"' with value " + (int) c);
337	}
338	char[] escaped =
339	escape(Character.toCodePoint((char) pendingHighSurrogate, c));
340	if (escaped != null) {
341	outputChars(escaped, escaped.length);
342	} else {
343	out.append((char) pendingHighSurrogate);
344	out.append(c);
345	}
346	pendingHighSurrogate = -1;
347	} else if (Character.isHighSurrogate(c)) {
348	// This is the start of a (split) surrogate pair.
349	pendingHighSurrogate = c;
350	} else {
351	if (Character.isLowSurrogate(c)) {
352	throw new IllegalArgumentException(
353	"Unexpected low surrogate character '" + c +
354	"' with value " + (int) c);
355	}
356	// This is a normal (non surrogate) char.
357	char[] escaped = escape(c);
358	if (escaped != null) {
359	outputChars(escaped, escaped.length);
360	} else {
361	out.append(c);
362	}
363	}
364	return this;
365	}
366
367	private void outputChars(char[] chars, int len) throws IOException {
368	for (int n = 0; n < len; n++) {
369	out.append(chars[n]);
370	}
371	}
372	};
373	}
374
375	/**
376	* Returns the Unicode code point of the character at the given index.
377	*
378	* <p>Unlike {@link Character#codePointAt(CharSequence, int)} or
379	* {@link String#codePointAt(int)} this method will never fail silently when
380	* encountering an invalid surrogate pair.
381	*
382	* <p>The behaviour of this method is as follows:
383	* <ol>
384	* <li>If {@code index >= end}, {@link IndexOutOfBoundsException} is thrown.
385	* <li><b>If the character at the specified index is not a surrogate, it is
386	* returned.</b>
387	* <li>If the first character was a high surrogate value, then an attempt is
388	* made to read the next character.
389	* <ol>
390	* <li><b>If the end of the sequence was reached, the negated value of
391	* the trailing high surrogate is returned.</b>
392	* <li><b>If the next character was a valid low surrogate, the code point
393	* value of the high/low surrogate pair is returned.</b>
394	* <li>If the next character was not a low surrogate value, then
395	* {@link IllegalArgumentException} is thrown.
396	* </ol>
397	* <li>If the first character was a low surrogate value,
398	* {@link IllegalArgumentException} is thrown.
399	* </ol>
400	*
401	* @param seq the sequence of characters from which to decode the code point
402	* @param index the index of the first character to decode
403	* @param end the index beyond the last valid character to decode
404	* @return the Unicode code point for the given index or the negated value of
405	* the trailing high surrogate character at the end of the sequence
406	*/
407	protected static final int codePointAt(CharSequence seq, int index, int end) {
408	if (index < end) {
409	char c1 = seq.charAt(index++);
410	if (c1 < Character.MIN_HIGH_SURROGATE \|\|
411	c1 > Character.MAX_LOW_SURROGATE) {
412	// Fast path (first test is probably all we need to do)
413	return c1;
414	} else if (c1 <= Character.MAX_HIGH_SURROGATE) {
415	// If the high surrogate was the last character, return its inverse
416	if (index == end) {
417	return -c1;
418	}
419	// Otherwise look for the low surrogate following it
420	char c2 = seq.charAt(index);
421	if (Character.isLowSurrogate(c2)) {
422	return Character.toCodePoint(c1, c2);
423	}
424	throw new IllegalArgumentException(
425	"Expected low surrogate but got char '" + c2 +
426	"' with value " + (int) c2 + " at index " + index);
427	} else {
428	throw new IllegalArgumentException(
429	"Unexpected low surrogate character '" + c1 +
430	"' with value " + (int) c1 + " at index " + (index - 1));
431	}
432	}
433	throw new IndexOutOfBoundsException("Index exceeds specified range");
434	}
435
436	/**
437	* Helper method to grow the character buffer as needed, this only happens
438	* once in a while so it's ok if it's in a method call. If the index passed
439	* in is 0 then no copying will be done.
440	*/
441	private static final char[] growBuffer(char[] dest, int index, int size) {
442	char[] copy = new char[size];
443	if (index > 0) {
444	System.arraycopy(dest, 0, copy, 0, index);
445	}
446	return copy;
447	}
448
449	/**
450	* A thread-local destination buffer to keep us from creating new buffers.
451	* The starting size is 1024 characters. If we grow past this we don't
452	* put it back in the threadlocal, we just keep going and grow as needed.
453	*/
454	private static final ThreadLocal<char[]> DEST_TL = new ThreadLocal<char[]>() {
455	@Override
456	protected char[] initialValue() {
457	return new char[1024];
458	}
459	};
460	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: