source: josm/trunk/src/org/openstreetmap/josm/data/validation/routines/UrlValidator.java@ 7779

Last change on this file since 7779 was 7779, checked in by Don-vip, 9 years ago

fix #10810 - update Apache Validator routines to latest version in trunk to support recent TLDs in DomainValidator

File size: 17.2 KB
Line 
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17package org.openstreetmap.josm.data.validation.routines;
18
19import static org.openstreetmap.josm.tools.I18n.tr;
20
21import java.util.Arrays;
22import java.util.Collections;
23import java.util.HashSet;
24import java.util.Set;
25import java.util.regex.Matcher;
26import java.util.regex.Pattern;
27
28/**
29 * <p><b>URL Validation</b> routines.</p>
30 * Behavior of validation is modified by passing in options:
31 * <li>ALLOW_2_SLASHES - [FALSE] Allows double '/' characters in the path
32 * component.</li>
33 * <li>NO_FRAGMENT- [FALSE] By default fragments are allowed, if this option is
34 * included then fragments are flagged as illegal.</li>
35 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
36 * considered valid schemes. Enabling this option will let any scheme pass validation.</li>
37 *
38 * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
39 * http://javascript.internet.com. However, this validation now bears little resemblance
40 * to the php original.</p>
41 * <pre>
42 * Example of usage:
43 * Construct a UrlValidator with valid schemes of "http", and "https".
44 *
45 * String[] schemes = {"http","https"}.
46 * UrlValidator urlValidator = new UrlValidator(schemes);
47 * if (urlValidator.isValid("ftp://foo.bar.com/")) {
48 * System.out.println("url is valid");
49 * } else {
50 * System.out.println("url is invalid");
51 * }
52 *
53 * prints "url is invalid"
54 * If instead the default constructor is used.
55 *
56 * UrlValidator urlValidator = new UrlValidator();
57 * if (urlValidator.isValid("ftp://foo.bar.com/")) {
58 * System.out.println("url is valid");
59 * } else {
60 * System.out.println("url is invalid");
61 * }
62 *
63 * prints out "url is valid"
64 * </pre>
65 *
66 * @see
67 * <a href="http://www.ietf.org/rfc/rfc2396.txt">
68 * Uniform Resource Identifiers (URI): Generic Syntax
69 * </a>
70 *
71 * @version $Revision: 1640269 $ $Date: 2014-11-18 02:28:56 UTC (Tue, 18 Nov 2014) $
72 * @since Validator 1.4
73 */
74public class UrlValidator extends AbstractValidator {
75
76 /**
77 * Allows all validly formatted schemes to pass validation instead of
78 * supplying a set of valid schemes.
79 */
80 public static final long ALLOW_ALL_SCHEMES = 1 << 0;
81
82 /**
83 * Allow two slashes in the path component of the URL.
84 */
85 public static final long ALLOW_2_SLASHES = 1 << 1;
86
87 /**
88 * Enabling this options disallows any URL fragments.
89 */
90 public static final long NO_FRAGMENTS = 1 << 2;
91
92 /**
93 * Allow local URLs, such as http://localhost/ or http://machine/ .
94 * This enables a broad-brush check, for complex local machine name
95 * validation requirements you should create your validator with
96 * a {@link RegexValidator} instead ({@link #UrlValidator(RegexValidator, long)})
97 */
98 public static final long ALLOW_LOCAL_URLS = 1 << 3;
99
100 // Drop numeric, and "+-." for now
101 private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\.";
102
103 /**
104 * This expression derived/taken from the BNF for URI (RFC2396).
105 */
106 private static final String URL_REGEX =
107 "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?";
108
109 private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEX);
110
111 /**
112 * Schema/Protocol (ie. http:, ftp:, file:, etc).
113 */
114 private static final int PARSE_URL_SCHEME = 2;
115
116 /**
117 * Includes hostname/ip and port number.
118 */
119 private static final int PARSE_URL_AUTHORITY = 4;
120
121 private static final int PARSE_URL_PATH = 5;
122
123 private static final int PARSE_URL_QUERY = 7;
124
125 private static final int PARSE_URL_FRAGMENT = 9;
126
127 /**
128 * Protocol (ie. http:, ftp:,https:).
129 */
130 private static final String SCHEME_REGEX = "^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*";
131 private static final Pattern SCHEME_PATTERN = Pattern.compile(SCHEME_REGEX);
132
133 private static final String AUTHORITY_REGEX =
134 "^([" + AUTHORITY_CHARS_REGEX + "]*)(:\\d*)?(.*)?";
135
136 private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX);
137
138 private static final int PARSE_AUTHORITY_HOST_IP = 1;
139
140 private static final int PARSE_AUTHORITY_PORT = 2;
141
142 /**
143 * Should always be empty.
144 */
145 private static final int PARSE_AUTHORITY_EXTRA = 3;
146
147 private static final String PATH_REGEX = "^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$";
148 private static final Pattern PATH_PATTERN = Pattern.compile(PATH_REGEX);
149
150 private static final String QUERY_REGEX = "^(.*)$";
151 private static final Pattern QUERY_PATTERN = Pattern.compile(QUERY_REGEX);
152
153 private static final String LEGAL_ASCII_REGEX = "^\\p{ASCII}+$";
154 private static final Pattern ASCII_PATTERN = Pattern.compile(LEGAL_ASCII_REGEX);
155
156 private static final String PORT_REGEX = "^:(\\d{1,5})$";
157 private static final Pattern PORT_PATTERN = Pattern.compile(PORT_REGEX);
158
159 private static final String SIMPLE_WEBSITE = "^www[.].*";
160 private static final Pattern SIMPLE_WEBSITE_PATTERN = Pattern.compile(SIMPLE_WEBSITE);
161
162 /**
163 * Holds the set of current validation options.
164 */
165 private final long options;
166
167 /**
168 * The set of schemes that are allowed to be in a URL.
169 */
170 private final Set<String> allowedSchemes;
171
172 /**
173 * Regular expressions used to manually validate authorities if IANA
174 * domain name validation isn't desired.
175 */
176 private final RegexValidator authorityValidator;
177
178 /**
179 * If no schemes are provided, default to this set.
180 */
181 private static final String[] DEFAULT_SCHEMES = {"http", "https", "ftp"};
182
183 /**
184 * Singleton instance of this class with default schemes and options.
185 */
186 private static final UrlValidator DEFAULT_URL_VALIDATOR = new UrlValidator();
187
188 /**
189 * Returns the singleton instance of this class with default schemes and options.
190 * @return singleton instance with default schemes and options
191 */
192 public static UrlValidator getInstance() {
193 return DEFAULT_URL_VALIDATOR;
194 }
195
196 /**
197 * Create a UrlValidator with default properties.
198 */
199 public UrlValidator() {
200 this(null);
201 }
202
203 /**
204 * Behavior of validation is modified by passing in several strings options:
205 * @param schemes Pass in one or more url schemes to consider valid, passing in
206 * a null will default to "http,https,ftp" being valid.
207 * If a non-null schemes is specified then all valid schemes must
208 * be specified. Setting the ALLOW_ALL_SCHEMES option will
209 * ignore the contents of schemes.
210 */
211 public UrlValidator(String[] schemes) {
212 this(schemes, 0L);
213 }
214
215 /**
216 * Initialize a UrlValidator with the given validation options.
217 * @param options The options should be set using the public constants declared in
218 * this class. To set multiple options you simply add them together. For example,
219 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
220 */
221 public UrlValidator(long options) {
222 this(null, null, options);
223 }
224
225 /**
226 * Behavior of validation is modified by passing in options:
227 * @param schemes The set of valid schemes.
228 * @param options The options should be set using the public constants declared in
229 * this class. To set multiple options you simply add them together. For example,
230 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
231 */
232 public UrlValidator(String[] schemes, long options) {
233 this(schemes, null, options);
234 }
235
236 /**
237 * Initialize a UrlValidator with the given validation options.
238 * @param authorityValidator Regular expression validator used to validate the authority part
239 * @param options Validation options. Set using the public constants of this class.
240 * To set multiple options, simply add them together:
241 * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
242 * enables both of those options.
243 */
244 public UrlValidator(RegexValidator authorityValidator, long options) {
245 this(null, authorityValidator, options);
246 }
247
248 /**
249 * Customizable constructor. Validation behavior is modifed by passing in options.
250 * @param schemes the set of valid schemes
251 * @param authorityValidator Regular expression validator used to validate the authority part
252 * @param options Validation options. Set using the public constants of this class.
253 * To set multiple options, simply add them together:
254 * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
255 * enables both of those options.
256 */
257 public UrlValidator(String[] schemes, RegexValidator authorityValidator, long options) {
258 this.options = options;
259
260 if (isOn(ALLOW_ALL_SCHEMES)) {
261 this.allowedSchemes = Collections.emptySet();
262 } else {
263 if (schemes == null) {
264 schemes = DEFAULT_SCHEMES;
265 }
266 this.allowedSchemes = new HashSet<>();
267 this.allowedSchemes.addAll(Arrays.asList(schemes));
268 }
269
270 this.authorityValidator = authorityValidator;
271
272 }
273
274 /**
275 * <p>Checks if a field has a valid url address.</p>
276 *
277 * @param value The value validation is being performed on. A <code>null</code>
278 * value is considered invalid.
279 * @return true if the url is valid.
280 */
281 @Override
282 public boolean isValid(String value) {
283 if (value == null) {
284 return false;
285 }
286
287 if (!ASCII_PATTERN.matcher(value).matches()) {
288 setErrorMessage(tr("URL contains non-ascii characters"));
289 setFix(value.replaceAll("[^\\p{ASCII}]+", ""));
290 return false;
291 }
292
293 // Check the whole url address structure
294 Matcher urlMatcher = URL_PATTERN.matcher(value);
295 if (!urlMatcher.matches()) {
296 setErrorMessage(tr("URL is invalid"));
297 return false;
298 }
299
300 String scheme = urlMatcher.group(PARSE_URL_SCHEME);
301 if (!isValidScheme(scheme)) {
302 setErrorMessage(tr("URL contains an invalid protocol: {0}", scheme));
303 if (SIMPLE_WEBSITE_PATTERN.matcher(value).matches()) {
304 setFix("http://" + value);
305 }
306 return false;
307 }
308
309 String authority = urlMatcher.group(PARSE_URL_AUTHORITY);
310 if ("file".equals(scheme) && "".equals(authority)) {
311 // Special case - file: allows an empty authority
312 } else {
313 // Validate the authority
314 if (!isValidAuthority(authority)) {
315 setErrorMessage(tr("URL contains an invalid authority: {0}", authority));
316 return false;
317 }
318 }
319
320 String path = urlMatcher.group(PARSE_URL_PATH);
321 if (!isValidPath(path)) {
322 setErrorMessage(tr("URL contains an invalid path: {0}", path));
323 return false;
324 }
325
326 String query = urlMatcher.group(PARSE_URL_QUERY);
327 if (!isValidQuery(query)) {
328 setErrorMessage(tr("URL contains an invalid query: {0}", query));
329 return false;
330 }
331
332 String fragment = urlMatcher.group(PARSE_URL_FRAGMENT);
333 if (!isValidFragment(fragment)) {
334 setErrorMessage(tr("URL contains an invalid fragment: {0}", fragment));
335 return false;
336 }
337
338 return true;
339 }
340
341 /**
342 * Validate scheme. If schemes[] was initialized to a non null,
343 * then only those scheme's are allowed. Note this is slightly different
344 * than for the constructor.
345 * @param scheme The scheme to validate. A <code>null</code> value is considered
346 * invalid.
347 * @return true if valid.
348 */
349 protected boolean isValidScheme(String scheme) {
350 if (scheme == null) {
351 return false;
352 }
353
354 if (!SCHEME_PATTERN.matcher(scheme).matches()) {
355 return false;
356 }
357
358 if (isOff(ALLOW_ALL_SCHEMES)) {
359
360 if (!this.allowedSchemes.contains(scheme)) {
361 return false;
362 }
363 }
364
365 return true;
366 }
367
368 /**
369 * Returns true if the authority is properly formatted. An authority is the combination
370 * of hostname and port. A <code>null</code> authority value is considered invalid.
371 * @param authority Authority value to validate.
372 * @return true if authority (hostname and port) is valid.
373 */
374 protected boolean isValidAuthority(String authority) {
375 if (authority == null) {
376 return false;
377 }
378
379 // check manual authority validation if specified
380 if (authorityValidator != null) {
381 if (authorityValidator.isValid(authority)) {
382 return true;
383 }
384 }
385
386 Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authority);
387 if (!authorityMatcher.matches()) {
388 return false;
389 }
390
391 String hostLocation = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
392 // check if authority is hostname or IP address:
393 // try a hostname first since that's much more likely
394 DomainValidator domainValidator = DomainValidator.getInstance(isOn(ALLOW_LOCAL_URLS));
395 if (!domainValidator.isValid(hostLocation)) {
396 // try an IP address
397 InetAddressValidator inetAddressValidator =
398 InetAddressValidator.getInstance();
399 if (!inetAddressValidator.isValid(hostLocation)) {
400 // isn't either one, so the URL is invalid
401 return false;
402 }
403 }
404
405 String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
406 if (port != null) {
407 if (!PORT_PATTERN.matcher(port).matches()) {
408 return false;
409 }
410 }
411
412 String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
413 if (extra != null && extra.trim().length() > 0){
414 return false;
415 }
416
417 return true;
418 }
419
420 /**
421 * Returns true if the path is valid. A <code>null</code> value is considered invalid.
422 * @param path Path value to validate.
423 * @return true if path is valid.
424 */
425 protected boolean isValidPath(String path) {
426 if (path == null) {
427 return false;
428 }
429
430 if (!PATH_PATTERN.matcher(path).matches()) {
431 return false;
432 }
433
434 int slash2Count = countToken("//", path);
435 if (isOff(ALLOW_2_SLASHES) && (slash2Count > 0)) {
436 return false;
437 }
438
439 int slashCount = countToken("/", path);
440 int dot2Count = countToken("..", path);
441 if (dot2Count > 0) {
442 if ((slashCount - slash2Count - 1) <= dot2Count) {
443 return false;
444 }
445 }
446
447 return true;
448 }
449
450 /**
451 * Returns true if the query is null or it's a properly formatted query string.
452 * @param query Query value to validate.
453 * @return true if query is valid.
454 */
455 protected boolean isValidQuery(String query) {
456 if (query == null) {
457 return true;
458 }
459
460 return QUERY_PATTERN.matcher(query).matches();
461 }
462
463 /**
464 * Returns true if the given fragment is null or fragments are allowed.
465 * @param fragment Fragment value to validate.
466 * @return true if fragment is valid.
467 */
468 protected boolean isValidFragment(String fragment) {
469 if (fragment == null) {
470 return true;
471 }
472
473 return isOff(NO_FRAGMENTS);
474 }
475
476 /**
477 * Returns the number of times the token appears in the target.
478 * @param token Token value to be counted.
479 * @param target Target value to count tokens in.
480 * @return the number of tokens.
481 */
482 protected int countToken(String token, String target) {
483 int tokenIndex = 0;
484 int count = 0;
485 while (tokenIndex != -1) {
486 tokenIndex = target.indexOf(token, tokenIndex);
487 if (tokenIndex > -1) {
488 tokenIndex++;
489 count++;
490 }
491 }
492 return count;
493 }
494
495 /**
496 * Tests whether the given flag is on. If the flag is not a power of 2
497 * (ie. 3) this tests whether the combination of flags is on.
498 *
499 * @param flag Flag value to check.
500 *
501 * @return whether the specified flag value is on.
502 */
503 private boolean isOn(long flag) {
504 return (this.options & flag) > 0;
505 }
506
507 /**
508 * Tests whether the given flag is off. If the flag is not a power of 2
509 * (ie. 3) this tests whether the combination of flags is off.
510 *
511 * @param flag Flag value to check.
512 *
513 * @return whether the specified flag value is off.
514 */
515 private boolean isOff(long flag) {
516 return (this.options & flag) == 0;
517 }
518}
Note: See TracBrowser for help on using the repository browser.