source: josm/trunk/src/org/openstreetmap/josm/data/validation/routines/UrlValidator.java@ 11452

Last change on this file since 11452 was 11452, checked in by Don-vip, 7 years ago

sonar - fb-contrib:SEO_SUBOPTIMAL_EXPRESSION_ORDER - Performance - Method orders expressions in a conditional in a sub optimal way

  • Property svn:eol-style set to native
File size: 19.4 KB
Line 
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17package org.openstreetmap.josm.data.validation.routines;
18
19import static org.openstreetmap.josm.tools.I18n.tr;
20
21import java.net.URI;
22import java.net.URISyntaxException;
23import java.util.Collections;
24import java.util.HashSet;
25import java.util.Locale;
26import java.util.Set;
27import java.util.regex.Matcher;
28import java.util.regex.Pattern;
29
30import org.openstreetmap.josm.Main;
31
32/**
33 * <p><b>URL Validation</b> routines.</p>
34 * Behavior of validation is modified by passing in options:
35 * <ul>
36 * <li>ALLOW_2_SLASHES - [FALSE] Allows double '/' characters in the path
37 * component.</li>
38 * <li>NO_FRAGMENT- [FALSE] By default fragments are allowed, if this option is
39 * included then fragments are flagged as illegal.</li>
40 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
41 * considered valid schemes. Enabling this option will let any scheme pass validation.</li>
42 * </ul>
43 *
44 * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
45 * http://javascript.internet.com. However, this validation now bears little resemblance
46 * to the php original.</p>
47 * <pre>
48 * Example of usage:
49 * Construct a UrlValidator with valid schemes of "http", and "https".
50 *
51 * String[] schemes = {"http","https"}.
52 * UrlValidator urlValidator = new UrlValidator(schemes);
53 * if (urlValidator.isValid("ftp://foo.bar.com/")) {
54 * System.out.println("url is valid");
55 * } else {
56 * System.out.println("url is invalid");
57 * }
58 *
59 * prints "url is invalid"
60 * If instead the default constructor is used.
61 *
62 * UrlValidator urlValidator = new UrlValidator();
63 * if (urlValidator.isValid("ftp://foo.bar.com/")) {
64 * System.out.println("url is valid");
65 * } else {
66 * System.out.println("url is invalid");
67 * }
68 *
69 * prints out "url is valid"
70 * </pre>
71 *
72 * @version $Revision: 1741724 $
73 * @see
74 * <a href="http://www.ietf.org/rfc/rfc2396.txt">
75 * Uniform Resource Identifiers (URI): Generic Syntax
76 * </a>
77 *
78 * @since Validator 1.4
79 */
80public class UrlValidator extends AbstractValidator {
81
82 /**
83 * Allows all validly formatted schemes to pass validation instead of
84 * supplying a set of valid schemes.
85 */
86 public static final long ALLOW_ALL_SCHEMES = 1 << 0;
87
88 /**
89 * Allow two slashes in the path component of the URL.
90 */
91 public static final long ALLOW_2_SLASHES = 1 << 1;
92
93 /**
94 * Enabling this options disallows any URL fragments.
95 */
96 public static final long NO_FRAGMENTS = 1 << 2;
97
98 /**
99 * Allow local URLs, such as http://localhost/ or http://machine/ .
100 * This enables a broad-brush check, for complex local machine name
101 * validation requirements you should create your validator with
102 * a {@link RegexValidator} instead ({@link #UrlValidator(RegexValidator, long)})
103 */
104 public static final long ALLOW_LOCAL_URLS = 1 << 3; // CHECKSTYLE IGNORE MagicNumber
105
106 /**
107 * This expression derived/taken from the BNF for URI (RFC2396).
108 */
109 private static final String URL_REGEX =
110 "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?";
111 // 12 3 4 5 6 7 8 9
112 private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEX);
113
114 /**
115 * Schema/Protocol (ie. http:, ftp:, file:, etc).
116 */
117 private static final int PARSE_URL_SCHEME = 2;
118
119 /**
120 * Includes hostname/ip and port number.
121 */
122 private static final int PARSE_URL_AUTHORITY = 4;
123
124 private static final int PARSE_URL_PATH = 5;
125
126 private static final int PARSE_URL_QUERY = 7;
127
128 private static final int PARSE_URL_FRAGMENT = 9;
129
130 /**
131 * Protocol scheme (e.g. http, ftp, https).
132 */
133 private static final String SCHEME_REGEX = "^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*";
134 private static final Pattern SCHEME_PATTERN = Pattern.compile(SCHEME_REGEX);
135
136 // Drop numeric, and "+-." for now
137 // TODO does not allow for optional userinfo.
138 // Validation of character set is done by isValidAuthority
139 private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\."; // allows for IPV4 but not IPV6
140 private static final String IPV6_REGEX = "[0-9a-fA-F:]+"; // do this as separate match because : could cause ambiguity with port prefix
141
142 // userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
143 // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
144 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
145 // We assume that password has the same valid chars as user info
146 private static final String USERINFO_CHARS_REGEX = "[a-zA-Z0-9%-._~!$&'()*+,;=]";
147 // since neither ':' nor '@' are allowed chars, we don't need to use non-greedy matching
148 private static final String USERINFO_FIELD_REGEX =
149 USERINFO_CHARS_REGEX + "+:" + // At least one character for the name
150 USERINFO_CHARS_REGEX + "*@"; // password may be absent
151 private static final String AUTHORITY_REGEX =
152 "(?:\\[("+IPV6_REGEX+")\\]|(?:(?:"+USERINFO_FIELD_REGEX+")?([" + AUTHORITY_CHARS_REGEX + "]*)))(:\\d*)?(.*)?";
153 // 1 e.g. user:pass@ 2 3 4
154 private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX);
155
156 private static final int PARSE_AUTHORITY_IPV6 = 1;
157
158 private static final int PARSE_AUTHORITY_HOST_IP = 2; // excludes userinfo, if present
159
160 /**
161 * Should always be empty. The code currently allows spaces.
162 */
163 private static final int PARSE_AUTHORITY_EXTRA = 4;
164
165 private static final String PATH_REGEX = "^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$";
166 private static final Pattern PATH_PATTERN = Pattern.compile(PATH_REGEX);
167
168 private static final String QUERY_REGEX = "^(.*)$";
169 private static final Pattern QUERY_PATTERN = Pattern.compile(QUERY_REGEX);
170
171 /**
172 * Holds the set of current validation options.
173 */
174 private final long options;
175
176 /**
177 * The set of schemes that are allowed to be in a URL.
178 */
179 private final Set<String> allowedSchemes; // Must be lower-case
180
181 /**
182 * Regular expressions used to manually validate authorities if IANA
183 * domain name validation isn't desired.
184 */
185 private final RegexValidator authorityValidator;
186
187 /**
188 * If no schemes are provided, default to this set.
189 */
190 private static final String[] DEFAULT_SCHEMES = {"http", "https", "ftp"}; // Must be lower-case
191
192 /**
193 * Singleton instance of this class with default schemes and options.
194 */
195 private static final UrlValidator DEFAULT_URL_VALIDATOR = new UrlValidator();
196
197 /**
198 * Returns the singleton instance of this class with default schemes and options.
199 * @return singleton instance with default schemes and options
200 */
201 public static UrlValidator getInstance() {
202 return DEFAULT_URL_VALIDATOR;
203 }
204
205 /**
206 * Create a UrlValidator with default properties.
207 */
208 public UrlValidator() {
209 this((String[]) null);
210 }
211
212 /**
213 * Behavior of validation is modified by passing in several strings options:
214 * @param schemes Pass in one or more url schemes to consider valid, passing in
215 * a null will default to "http,https,ftp" being valid.
216 * If a non-null schemes is specified then all valid schemes must
217 * be specified. Setting the ALLOW_ALL_SCHEMES option will
218 * ignore the contents of schemes.
219 */
220 public UrlValidator(String ... schemes) {
221 this(schemes, 0L);
222 }
223
224 /**
225 * Initialize a UrlValidator with the given validation options.
226 * @param options The options should be set using the public constants declared in
227 * this class. To set multiple options you simply add them together. For example,
228 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
229 */
230 public UrlValidator(long options) {
231 this(null, null, options);
232 }
233
234 /**
235 * Behavior of validation is modified by passing in options:
236 * @param schemes The set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
237 * @param options The options should be set using the public constants declared in
238 * this class. To set multiple options you simply add them together. For example,
239 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
240 */
241 public UrlValidator(String[] schemes, long options) {
242 this(schemes, null, options);
243 }
244
245 /**
246 * Initialize a UrlValidator with the given validation options.
247 * @param authorityValidator Regular expression validator used to validate the authority part
248 * This allows the user to override the standard set of domains.
249 * @param options Validation options. Set using the public constants of this class.
250 * To set multiple options, simply add them together:
251 * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
252 * enables both of those options.
253 */
254 public UrlValidator(RegexValidator authorityValidator, long options) {
255 this(null, authorityValidator, options);
256 }
257
258 /**
259 * Customizable constructor. Validation behavior is modifed by passing in options.
260 * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
261 * @param authorityValidator Regular expression validator used to validate the authority part
262 * @param options Validation options. Set using the public constants of this class.
263 * To set multiple options, simply add them together:
264 * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
265 * enables both of those options.
266 */
267 public UrlValidator(String[] schemes, RegexValidator authorityValidator, long options) {
268 this.options = options;
269
270 if (isOn(ALLOW_ALL_SCHEMES)) {
271 allowedSchemes = Collections.emptySet();
272 } else {
273 if (schemes == null) {
274 schemes = DEFAULT_SCHEMES;
275 }
276 allowedSchemes = new HashSet<>(schemes.length);
277 for (int i = 0; i < schemes.length; i++) {
278 allowedSchemes.add(schemes[i].toLowerCase(Locale.ENGLISH));
279 }
280 }
281
282 this.authorityValidator = authorityValidator;
283 }
284
285 /**
286 * <p>Checks if a field has a valid url address.</p>
287 *
288 * Note that the method calls #isValidAuthority()
289 * which checks that the domain is valid.
290 *
291 * @param value The value validation is being performed on. A <code>null</code>
292 * value is considered invalid.
293 * @return true if the url is valid.
294 */
295 @Override
296 public boolean isValid(String value) {
297 if (value == null) {
298 return false;
299 }
300
301 // Check the whole url address structure
302 Matcher urlMatcher = URL_PATTERN.matcher(value);
303 if (!urlMatcher.matches()) {
304 setErrorMessage(tr("URL is invalid"));
305 return false;
306 }
307
308 String scheme = urlMatcher.group(PARSE_URL_SCHEME);
309 if (!isValidScheme(scheme)) {
310 setErrorMessage(tr("URL contains an invalid protocol: {0}", scheme));
311 return false;
312 }
313
314 String authority = urlMatcher.group(PARSE_URL_AUTHORITY);
315 if ("file".equals(scheme)) { // Special case - file: allows an empty authority
316 if (!"".equals(authority) && authority.contains(":")) { // but cannot allow trailing :
317 setErrorMessage(tr("URL contains an invalid authority: {0}", authority));
318 return false;
319 }
320 // drop through to continue validation
321 } else { // not file:
322 // Validate the authority
323 if (!isValidAuthority(authority)) {
324 setErrorMessage(tr("URL contains an invalid authority: {0}", authority));
325 return false;
326 }
327 }
328
329 String path = urlMatcher.group(PARSE_URL_PATH);
330 if (!isValidPath(path)) {
331 setErrorMessage(tr("URL contains an invalid path: {0}", path));
332 return false;
333 }
334
335 String query = urlMatcher.group(PARSE_URL_QUERY);
336 if (!isValidQuery(query)) {
337 setErrorMessage(tr("URL contains an invalid query: {0}", query));
338 return false;
339 }
340
341 String fragment = urlMatcher.group(PARSE_URL_FRAGMENT);
342 if (!isValidFragment(fragment)) {
343 setErrorMessage(tr("URL contains an invalid fragment: {0}", fragment));
344 return false;
345 }
346
347 return true;
348 }
349
350 @Override
351 public String getValidatorName() {
352 return tr("URL validator");
353 }
354
355 /**
356 * Validate scheme. If schemes[] was initialized to a non null,
357 * then only those schemes are allowed.
358 * Otherwise the default schemes are "http", "https", "ftp".
359 * Matching is case-blind.
360 * @param scheme The scheme to validate. A <code>null</code> value is considered
361 * invalid.
362 * @return true if valid.
363 */
364 protected boolean isValidScheme(String scheme) {
365 if (scheme == null) {
366 return false;
367 }
368
369 // TODO could be removed if external schemes were checked in the ctor before being stored
370 if (!SCHEME_PATTERN.matcher(scheme).matches()) {
371 return false;
372 }
373
374 if (isOff(ALLOW_ALL_SCHEMES) && !allowedSchemes.contains(scheme.toLowerCase(Locale.ENGLISH))) {
375 return false;
376 }
377
378 return true;
379 }
380
381 /**
382 * Returns true if the authority is properly formatted. An authority is the combination
383 * of hostname and port. A <code>null</code> authority value is considered invalid.
384 * Note: this implementation validates the domain unless a RegexValidator was provided.
385 * If a RegexValidator was supplied and it matches, then the authority is regarded
386 * as valid with no further checks, otherwise the method checks against the
387 * AUTHORITY_PATTERN and the DomainValidator (ALLOW_LOCAL_URLS)
388 * @param authority Authority value to validate, alllows IDN
389 * @return true if authority (hostname and port) is valid.
390 */
391 protected boolean isValidAuthority(String authority) {
392 if (authority == null) {
393 return false;
394 }
395
396 // check manual authority validation if specified
397 if (authorityValidator != null && authorityValidator.isValid(authority)) {
398 return true;
399 }
400 // convert to ASCII if possible
401 final String authorityASCII = DomainValidator.unicodeToASCII(authority);
402
403 Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authorityASCII);
404 if (!authorityMatcher.matches()) {
405 return false;
406 }
407
408 // We have to process IPV6 separately because that is parsed in a different group
409 String ipv6 = authorityMatcher.group(PARSE_AUTHORITY_IPV6);
410 if (ipv6 != null) {
411 InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance();
412 if (!inetAddressValidator.isValidInet6Address(ipv6)) {
413 return false;
414 }
415 } else {
416 String hostLocation = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
417 // check if authority is hostname or IP address:
418 // try a hostname first since that's much more likely
419 DomainValidator domainValidator = DomainValidator.getInstance(isOn(ALLOW_LOCAL_URLS));
420 if (!domainValidator.isValid(hostLocation)) {
421 // try an IPv4 address
422 InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance();
423 if (!inetAddressValidator.isValidInet4Address(hostLocation)) {
424 // isn't IPv4, so the URL is invalid
425 return false;
426 }
427 }
428 }
429
430 String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
431 if (extra != null && !extra.trim().isEmpty()) {
432 return false;
433 }
434
435 return true;
436 }
437
438 /**
439 * Returns true if the path is valid. A <code>null</code> value is considered invalid.
440 * @param path Path value to validate.
441 * @return true if path is valid.
442 */
443 protected boolean isValidPath(String path) {
444 if (path == null) {
445 return false;
446 }
447
448 if (!PATH_PATTERN.matcher(path).matches()) {
449 return false;
450 }
451
452 try {
453 URI uri = new URI(null, null, path, null);
454 String norm = uri.normalize().getPath();
455 if (norm.startsWith("/../") // Trying to go via the parent dir
456 || "/..".equals(norm)) { // Trying to go to the parent dir
457 return false;
458 }
459 } catch (URISyntaxException e) {
460 Main.trace(e);
461 return false;
462 }
463
464 int slash2Count = countToken("//", path);
465 if (slash2Count > 0 && isOff(ALLOW_2_SLASHES)) {
466 return false;
467 }
468
469 return true;
470 }
471
472 /**
473 * Returns true if the query is null or it's a properly formatted query string.
474 * @param query Query value to validate.
475 * @return true if query is valid.
476 */
477 protected boolean isValidQuery(String query) {
478 if (query == null) {
479 return true;
480 }
481
482 return QUERY_PATTERN.matcher(query).matches();
483 }
484
485 /**
486 * Returns true if the given fragment is null or fragments are allowed.
487 * @param fragment Fragment value to validate.
488 * @return true if fragment is valid.
489 */
490 protected boolean isValidFragment(String fragment) {
491 if (fragment == null) {
492 return true;
493 }
494
495 return isOff(NO_FRAGMENTS);
496 }
497
498 /**
499 * Returns the number of times the token appears in the target.
500 * @param token Token value to be counted.
501 * @param target Target value to count tokens in.
502 * @return the number of tokens.
503 */
504 protected int countToken(String token, String target) {
505 int tokenIndex = 0;
506 int count = 0;
507 while (tokenIndex != -1) {
508 tokenIndex = target.indexOf(token, tokenIndex);
509 if (tokenIndex > -1) {
510 tokenIndex++;
511 count++;
512 }
513 }
514 return count;
515 }
516
517 /**
518 * Tests whether the given flag is on. If the flag is not a power of 2
519 * (ie. 3) this tests whether the combination of flags is on.
520 *
521 * @param flag Flag value to check.
522 *
523 * @return whether the specified flag value is on.
524 */
525 private boolean isOn(long flag) {
526 return (options & flag) > 0;
527 }
528
529 /**
530 * Tests whether the given flag is off. If the flag is not a power of 2
531 * (ie. 3) this tests whether the combination of flags is off.
532 *
533 * @param flag Flag value to check.
534 *
535 * @return whether the specified flag value is off.
536 */
537 private boolean isOff(long flag) {
538 return (options & flag) == 0;
539 }
540}
Note: See TracBrowser for help on using the repository browser.