| 1 | /* Copyright (c) 2008 Google Inc.
|
|---|
| 2 | *
|
|---|
| 3 | * Licensed under the Apache License, Version 2.0 (the "License");
|
|---|
| 4 | * you may not use this file except in compliance with the License.
|
|---|
| 5 | * You may obtain a copy of the License at
|
|---|
| 6 | *
|
|---|
| 7 | * http://www.apache.org/licenses/LICENSE-2.0
|
|---|
| 8 | *
|
|---|
| 9 | * Unless required by applicable law or agreed to in writing, software
|
|---|
| 10 | * distributed under the License is distributed on an "AS IS" BASIS,
|
|---|
| 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|---|
| 12 | * See the License for the specific language governing permissions and
|
|---|
| 13 | * limitations under the License.
|
|---|
| 14 | */
|
|---|
| 15 |
|
|---|
| 16 |
|
|---|
| 17 | package com.google.gdata.util.common.base;
|
|---|
| 18 |
|
|---|
| 19 | /**
|
|---|
| 20 | * A {@code UnicodeEscaper} that escapes some set of Java characters using
|
|---|
| 21 | * the URI percent encoding scheme. The set of safe characters (those which
|
|---|
| 22 | * remain unescaped) can be specified on construction.
|
|---|
| 23 | *
|
|---|
| 24 | * <p>For details on escaping URIs for use in web pages, see section 2.4 of
|
|---|
| 25 | * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
|
|---|
| 26 | *
|
|---|
| 27 | * <p>In most cases this class should not need to be used directly. If you
|
|---|
| 28 | * have no special requirements for escaping your URIs, you should use either
|
|---|
| 29 | * {@link CharEscapers#uriEscaper()} or
|
|---|
| 30 | * {@link CharEscapers#uriEscaper(boolean)}.
|
|---|
| 31 | *
|
|---|
| 32 | * <p>When encoding a String, the following rules apply:
|
|---|
| 33 | * <ul>
|
|---|
| 34 | * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
|
|---|
| 35 | * through "9" remain the same.
|
|---|
| 36 | * <li>Any additionally specified safe characters remain the same.
|
|---|
| 37 | * <li>If {@code plusForSpace} was specified, the space character " " is
|
|---|
| 38 | * converted into a plus sign "+".
|
|---|
| 39 | * <li>All other characters are converted into one or more bytes using UTF-8
|
|---|
| 40 | * encoding and each byte is then represented by the 3-character string
|
|---|
| 41 | * "%XY", where "XY" is the two-digit, uppercase, hexadecimal representation
|
|---|
| 42 | * of the byte value.
|
|---|
| 43 | * </ul>
|
|---|
| 44 | *
|
|---|
| 45 | * <p>RFC 2396 specifies the set of unreserved characters as "-", "_", ".", "!",
|
|---|
| 46 | * "~", "*", "'", "(" and ")". It goes on to state:
|
|---|
| 47 | *
|
|---|
| 48 | * <p><i>Unreserved characters can be escaped without changing the semantics
|
|---|
| 49 | * of the URI, but this should not be done unless the URI is being used
|
|---|
| 50 | * in a context that does not allow the unescaped character to appear.</i>
|
|---|
| 51 | *
|
|---|
| 52 | * <p>For performance reasons the only currently supported character encoding of
|
|---|
| 53 | * this class is UTF-8.
|
|---|
| 54 | *
|
|---|
| 55 | * <p><b>Note</b>: This escaper produces uppercase hexadecimal sequences. From
|
|---|
| 56 | * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br>
|
|---|
| 57 | * <i>"URI producers and normalizers should use uppercase hexadecimal digits
|
|---|
| 58 | * for all percent-encodings."</i>
|
|---|
| 59 | *
|
|---|
| 60 | *
|
|---|
| 61 | */
|
|---|
| 62 | public class PercentEscaper extends UnicodeEscaper {
|
|---|
| 63 | /**
|
|---|
| 64 | * A string of safe characters that mimics the behavior of
|
|---|
| 65 | * {@link java.net.URLEncoder}.
|
|---|
| 66 | *
|
|---|
| 67 | */
|
|---|
| 68 | public static final String SAFECHARS_URLENCODER = "-_.*";
|
|---|
| 69 |
|
|---|
| 70 | /**
|
|---|
| 71 | * A string of characters that do not need to be encoded when used in URI
|
|---|
| 72 | * path segments, as specified in RFC 3986. Note that some of these
|
|---|
| 73 | * characters do need to be escaped when used in other parts of the URI.
|
|---|
| 74 | */
|
|---|
| 75 | public static final String SAFEPATHCHARS_URLENCODER = "-_.!~*'()@:$&,;=";
|
|---|
| 76 |
|
|---|
| 77 | /**
|
|---|
| 78 | * A string of characters that do not need to be encoded when used in URI
|
|---|
| 79 | * query strings, as specified in RFC 3986. Note that some of these
|
|---|
| 80 | * characters do need to be escaped when used in other parts of the URI.
|
|---|
| 81 | */
|
|---|
| 82 | public static final String SAFEQUERYSTRINGCHARS_URLENCODER
|
|---|
| 83 | = "-_.!~*'()@:$,;/?:";
|
|---|
| 84 |
|
|---|
| 85 | // In some uri escapers spaces are escaped to '+'
|
|---|
| 86 | private static final char[] URI_ESCAPED_SPACE = { '+' };
|
|---|
| 87 |
|
|---|
| 88 | private static final char[] UPPER_HEX_DIGITS =
|
|---|
| 89 | "0123456789ABCDEF".toCharArray();
|
|---|
| 90 |
|
|---|
| 91 | /**
|
|---|
| 92 | * If true we should convert space to the {@code +} character.
|
|---|
| 93 | */
|
|---|
| 94 | private final boolean plusForSpace;
|
|---|
| 95 |
|
|---|
| 96 | /**
|
|---|
| 97 | * An array of flags where for any {@code char c} if {@code safeOctets[c]} is
|
|---|
| 98 | * true then {@code c} should remain unmodified in the output. If
|
|---|
| 99 | * {@code c > safeOctets.length} then it should be escaped.
|
|---|
| 100 | */
|
|---|
| 101 | private final boolean[] safeOctets;
|
|---|
| 102 |
|
|---|
| 103 | /**
|
|---|
| 104 | * Constructs a URI escaper with the specified safe characters and optional
|
|---|
| 105 | * handling of the space character.
|
|---|
| 106 | *
|
|---|
| 107 | * @param safeChars a non null string specifying additional safe characters
|
|---|
| 108 | * for this escaper (the ranges 0..9, a..z and A..Z are always safe and
|
|---|
| 109 | * should not be specified here)
|
|---|
| 110 | * @param plusForSpace true if ASCII space should be escaped to {@code +}
|
|---|
| 111 | * rather than {@code %20}
|
|---|
| 112 | * @throws IllegalArgumentException if any of the parameters were invalid
|
|---|
| 113 | */
|
|---|
| 114 | public PercentEscaper(String safeChars, boolean plusForSpace) {
|
|---|
| 115 | // Avoid any misunderstandings about the behavior of this escaper
|
|---|
| 116 | if (safeChars.matches(".*[0-9A-Za-z].*")) {
|
|---|
| 117 | throw new IllegalArgumentException(
|
|---|
| 118 | "Alphanumeric characters are always 'safe' and should not be " +
|
|---|
| 119 | "explicitly specified");
|
|---|
| 120 | }
|
|---|
| 121 | // Avoid ambiguous parameters. Safe characters are never modified so if
|
|---|
| 122 | // space is a safe character then setting plusForSpace is meaningless.
|
|---|
| 123 | if (plusForSpace && safeChars.contains(" ")) {
|
|---|
| 124 | throw new IllegalArgumentException(
|
|---|
| 125 | "plusForSpace cannot be specified when space is a 'safe' character");
|
|---|
| 126 | }
|
|---|
| 127 | if (safeChars.contains("%")) {
|
|---|
| 128 | throw new IllegalArgumentException(
|
|---|
| 129 | "The '%' character cannot be specified as 'safe'");
|
|---|
| 130 | }
|
|---|
| 131 | this.plusForSpace = plusForSpace;
|
|---|
| 132 | this.safeOctets = createSafeOctets(safeChars);
|
|---|
| 133 | }
|
|---|
| 134 |
|
|---|
| 135 | /**
|
|---|
| 136 | * Creates a boolean[] with entries corresponding to the character values
|
|---|
| 137 | * for 0-9, A-Z, a-z and those specified in safeChars set to true. The array
|
|---|
| 138 | * is as small as is required to hold the given character information.
|
|---|
| 139 | */
|
|---|
| 140 | private static boolean[] createSafeOctets(String safeChars) {
|
|---|
| 141 | int maxChar = 'z';
|
|---|
| 142 | char[] safeCharArray = safeChars.toCharArray();
|
|---|
| 143 | for (char c : safeCharArray) {
|
|---|
| 144 | maxChar = Math.max(c, maxChar);
|
|---|
| 145 | }
|
|---|
| 146 | boolean[] octets = new boolean[maxChar + 1];
|
|---|
| 147 | for (int c = '0'; c <= '9'; c++) {
|
|---|
| 148 | octets[c] = true;
|
|---|
| 149 | }
|
|---|
| 150 | for (int c = 'A'; c <= 'Z'; c++) {
|
|---|
| 151 | octets[c] = true;
|
|---|
| 152 | }
|
|---|
| 153 | for (int c = 'a'; c <= 'z'; c++) {
|
|---|
| 154 | octets[c] = true;
|
|---|
| 155 | }
|
|---|
| 156 | for (char c : safeCharArray) {
|
|---|
| 157 | octets[c] = true;
|
|---|
| 158 | }
|
|---|
| 159 | return octets;
|
|---|
| 160 | }
|
|---|
| 161 |
|
|---|
| 162 | /*
|
|---|
| 163 | * Overridden for performance. For unescaped strings this improved the
|
|---|
| 164 | * performance of the uri escaper from ~760ns to ~400ns as measured by
|
|---|
| 165 | * {@link CharEscapersBenchmark}.
|
|---|
| 166 | */
|
|---|
| 167 | @Override
|
|---|
| 168 | protected int nextEscapeIndex(CharSequence csq, int index, int end) {
|
|---|
| 169 | for (; index < end; index++) {
|
|---|
| 170 | char c = csq.charAt(index);
|
|---|
| 171 | if (c >= safeOctets.length || !safeOctets[c]) {
|
|---|
| 172 | break;
|
|---|
| 173 | }
|
|---|
| 174 | }
|
|---|
| 175 | return index;
|
|---|
| 176 | }
|
|---|
| 177 |
|
|---|
| 178 | /*
|
|---|
| 179 | * Overridden for performance. For unescaped strings this improved the
|
|---|
| 180 | * performance of the uri escaper from ~400ns to ~170ns as measured by
|
|---|
| 181 | * {@link CharEscapersBenchmark}.
|
|---|
| 182 | */
|
|---|
| 183 | @Override
|
|---|
| 184 | public String escape(String s) {
|
|---|
| 185 | int slen = s.length();
|
|---|
| 186 | for (int index = 0; index < slen; index++) {
|
|---|
| 187 | char c = s.charAt(index);
|
|---|
| 188 | if (c >= safeOctets.length || !safeOctets[c]) {
|
|---|
| 189 | return escapeSlow(s, index);
|
|---|
| 190 | }
|
|---|
| 191 | }
|
|---|
| 192 | return s;
|
|---|
| 193 | }
|
|---|
| 194 |
|
|---|
| 195 | /**
|
|---|
| 196 | * Escapes the given Unicode code point in UTF-8.
|
|---|
| 197 | */
|
|---|
| 198 | @Override
|
|---|
| 199 | protected char[] escape(int cp) {
|
|---|
| 200 | // We should never get negative values here but if we do it will throw an
|
|---|
| 201 | // IndexOutOfBoundsException, so at least it will get spotted.
|
|---|
| 202 | if (cp < safeOctets.length && safeOctets[cp]) {
|
|---|
| 203 | return null;
|
|---|
| 204 | } else if (cp == ' ' && plusForSpace) {
|
|---|
| 205 | return URI_ESCAPED_SPACE;
|
|---|
| 206 | } else if (cp <= 0x7F) {
|
|---|
| 207 | // Single byte UTF-8 characters
|
|---|
| 208 | // Start with "%--" and fill in the blanks
|
|---|
| 209 | char[] dest = new char[3];
|
|---|
| 210 | dest[0] = '%';
|
|---|
| 211 | dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
|
|---|
| 212 | dest[1] = UPPER_HEX_DIGITS[cp >>> 4];
|
|---|
| 213 | return dest;
|
|---|
| 214 | } else if (cp <= 0x7ff) {
|
|---|
| 215 | // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff]
|
|---|
| 216 | // Start with "%--%--" and fill in the blanks
|
|---|
| 217 | char[] dest = new char[6];
|
|---|
| 218 | dest[0] = '%';
|
|---|
| 219 | dest[3] = '%';
|
|---|
| 220 | dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
|
|---|
| 221 | cp >>>= 4;
|
|---|
| 222 | dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
|
|---|
| 223 | cp >>>= 2;
|
|---|
| 224 | dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
|
|---|
| 225 | cp >>>= 4;
|
|---|
| 226 | dest[1] = UPPER_HEX_DIGITS[0xC | cp];
|
|---|
| 227 | return dest;
|
|---|
| 228 | } else if (cp <= 0xffff) {
|
|---|
| 229 | // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff]
|
|---|
| 230 | // Start with "%E-%--%--" and fill in the blanks
|
|---|
| 231 | char[] dest = new char[9];
|
|---|
| 232 | dest[0] = '%';
|
|---|
| 233 | dest[1] = 'E';
|
|---|
| 234 | dest[3] = '%';
|
|---|
| 235 | dest[6] = '%';
|
|---|
| 236 | dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
|
|---|
| 237 | cp >>>= 4;
|
|---|
| 238 | dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
|
|---|
| 239 | cp >>>= 2;
|
|---|
| 240 | dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
|
|---|
| 241 | cp >>>= 4;
|
|---|
| 242 | dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
|
|---|
| 243 | cp >>>= 2;
|
|---|
| 244 | dest[2] = UPPER_HEX_DIGITS[cp];
|
|---|
| 245 | return dest;
|
|---|
| 246 | } else if (cp <= 0x10ffff) {
|
|---|
| 247 | char[] dest = new char[12];
|
|---|
| 248 | // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff]
|
|---|
| 249 | // Start with "%F-%--%--%--" and fill in the blanks
|
|---|
| 250 | dest[0] = '%';
|
|---|
| 251 | dest[1] = 'F';
|
|---|
| 252 | dest[3] = '%';
|
|---|
| 253 | dest[6] = '%';
|
|---|
| 254 | dest[9] = '%';
|
|---|
| 255 | dest[11] = UPPER_HEX_DIGITS[cp & 0xF];
|
|---|
| 256 | cp >>>= 4;
|
|---|
| 257 | dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
|
|---|
| 258 | cp >>>= 2;
|
|---|
| 259 | dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
|
|---|
| 260 | cp >>>= 4;
|
|---|
| 261 | dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
|
|---|
| 262 | cp >>>= 2;
|
|---|
| 263 | dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
|
|---|
| 264 | cp >>>= 4;
|
|---|
| 265 | dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
|
|---|
| 266 | cp >>>= 2;
|
|---|
| 267 | dest[2] = UPPER_HEX_DIGITS[cp & 0x7];
|
|---|
| 268 | return dest;
|
|---|
| 269 | } else {
|
|---|
| 270 | // If this ever happens it is due to bug in UnicodeEscaper, not bad input.
|
|---|
| 271 | throw new IllegalArgumentException(
|
|---|
| 272 | "Invalid unicode character value " + cp);
|
|---|
| 273 | }
|
|---|
| 274 | }
|
|---|
| 275 | }
|
|---|