source: josm/trunk/src/org/openstreetmap/josm/data/validation/routines/UrlValidator.java@ 7489

Last change on this file since 7489 was 7489, checked in by Don-vip, 10 years ago

fix #10393 - Validation of URLs and e-mails in relevant tags, using modified subset of Apache Commons Validator 1.4

File size: 16.9 KB
Line 
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17package org.openstreetmap.josm.data.validation.routines;
18
19import static org.openstreetmap.josm.tools.I18n.tr;
20
21import java.util.Arrays;
22import java.util.Collections;
23import java.util.HashSet;
24import java.util.Set;
25import java.util.regex.Matcher;
26import java.util.regex.Pattern;
27
28/**
29 * <p><b>URL Validation</b> routines.</p>
 * Behavior of validation is modified by passing in options:
 * <ul>
 * <li>ALLOW_2_SLASHES - [FALSE] Allows double '/' characters in the path
 * component.</li>
 * <li>NO_FRAGMENTS - [FALSE] By default fragments are allowed; if this option is
 * included then fragments are flagged as illegal.</li>
 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
 * considered valid schemes. Enabling this option will let any validly formatted
 * scheme pass validation.</li>
 * <li>ALLOW_LOCAL_URLS - [FALSE] Allows local URLs, such as http://localhost/
 * or http://machine/ .</li>
 * </ul>
37 *
 * <p>Originally based on a PHP script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
 * http://javascript.internet.com. However, this validation now bears little resemblance
 * to the PHP original.</p>
41 * <pre>
42 * Example of usage:
43 * Construct a UrlValidator with valid schemes of "http", and "https".
44 *
 * String[] schemes = {"http","https"};
46 * UrlValidator urlValidator = new UrlValidator(schemes);
47 * if (urlValidator.isValid("ftp://foo.bar.com/")) {
48 * System.out.println("url is valid");
49 * } else {
50 * System.out.println("url is invalid");
51 * }
52 *
53 * prints "url is invalid"
54 * If instead the default constructor is used.
55 *
56 * UrlValidator urlValidator = new UrlValidator();
57 * if (urlValidator.isValid("ftp://foo.bar.com/")) {
58 * System.out.println("url is valid");
59 * } else {
60 * System.out.println("url is invalid");
61 * }
62 *
63 * prints out "url is valid"
64 * </pre>
65 *
66 * @see
67 * <a href="http://www.ietf.org/rfc/rfc2396.txt">
68 * Uniform Resource Identifiers (URI): Generic Syntax
69 * </a>
70 *
71 * @version $Revision: 1227719 $ $Date: 2012-01-05 18:45:51 +0100 (Thu, 05 Jan 2012) $
72 * @since Validator 1.4
73 */
74public class UrlValidator extends AbstractValidator {
75
76 /**
77 * Allows all validly formatted schemes to pass validation instead of
78 * supplying a set of valid schemes.
79 */
80 public static final long ALLOW_ALL_SCHEMES = 1 << 0;
81
82 /**
83 * Allow two slashes in the path component of the URL.
84 */
85 public static final long ALLOW_2_SLASHES = 1 << 1;
86
87 /**
88 * Enabling this options disallows any URL fragments.
89 */
90 public static final long NO_FRAGMENTS = 1 << 2;
91
92 /**
93 * Allow local URLs, such as http://localhost/ or http://machine/ .
94 * This enables a broad-brush check, for complex local machine name
95 * validation requirements you should create your validator with
96 * a {@link RegexValidator} instead ({@link #UrlValidator(RegexValidator, long)})
97 */
98 public static final long ALLOW_LOCAL_URLS = 1 << 3;
99
100 // Drop numeric, and "+-." for now
101 private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\.";
102
103 /**
104 * This expression derived/taken from the BNF for URI (RFC2396).
105 */
106 private static final String URL_REGEX =
107 "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?";
108
109 private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEX);
110
111 /**
112 * Schema/Protocol (ie. http:, ftp:, file:, etc).
113 */
114 private static final int PARSE_URL_SCHEME = 2;
115
116 /**
117 * Includes hostname/ip and port number.
118 */
119 private static final int PARSE_URL_AUTHORITY = 4;
120
121 private static final int PARSE_URL_PATH = 5;
122
123 private static final int PARSE_URL_QUERY = 7;
124
125 private static final int PARSE_URL_FRAGMENT = 9;
126
127 /**
128 * Protocol (ie. http:, ftp:,https:).
129 */
130 private static final String SCHEME_REGEX = "^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*";
131 private static final Pattern SCHEME_PATTERN = Pattern.compile(SCHEME_REGEX);
132
133 private static final String AUTHORITY_REGEX =
134 "^([" + AUTHORITY_CHARS_REGEX + "]*)(:\\d*)?(.*)?";
135
136 private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX);
137
138 private static final int PARSE_AUTHORITY_HOST_IP = 1;
139
140 private static final int PARSE_AUTHORITY_PORT = 2;
141
142 /**
143 * Should always be empty.
144 */
145 private static final int PARSE_AUTHORITY_EXTRA = 3;
146
147 private static final String PATH_REGEX = "^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$";
148 private static final Pattern PATH_PATTERN = Pattern.compile(PATH_REGEX);
149
150 private static final String QUERY_REGEX = "^(.*)$";
151 private static final Pattern QUERY_PATTERN = Pattern.compile(QUERY_REGEX);
152
153 private static final String LEGAL_ASCII_REGEX = "^\\p{ASCII}+$";
154 private static final Pattern ASCII_PATTERN = Pattern.compile(LEGAL_ASCII_REGEX);
155
156 private static final String PORT_REGEX = "^:(\\d{1,5})$";
157 private static final Pattern PORT_PATTERN = Pattern.compile(PORT_REGEX);
158
159 /**
160 * Holds the set of current validation options.
161 */
162 private final long options;
163
164 /**
165 * The set of schemes that are allowed to be in a URL.
166 */
167 private final Set<String> allowedSchemes;
168
169 /**
170 * Regular expressions used to manually validate authorities if IANA
171 * domain name validation isn't desired.
172 */
173 private final RegexValidator authorityValidator;
174
175 /**
176 * If no schemes are provided, default to this set.
177 */
178 private static final String[] DEFAULT_SCHEMES = {"http", "https", "ftp"};
179
180 /**
181 * Singleton instance of this class with default schemes and options.
182 */
183 private static final UrlValidator DEFAULT_URL_VALIDATOR = new UrlValidator();
184
185 /**
186 * Returns the singleton instance of this class with default schemes and options.
187 * @return singleton instance with default schemes and options
188 */
189 public static UrlValidator getInstance() {
190 return DEFAULT_URL_VALIDATOR;
191 }
192
/**
 * Creates a UrlValidator with the default schemes and no options set.
 */
public UrlValidator() {
    this(null, null, 0L);
}

/**
 * Creates a UrlValidator for the given schemes, with no options set.
 * @param schemes One or more url schemes to consider valid; {@code null}
 *        defaults to "http,https,ftp" being valid.
 *        If a non-null schemes is specified then all valid schemes must
 *        be specified.
 */
public UrlValidator(String[] schemes) {
    this(schemes, null, 0L);
}

/**
 * Creates a UrlValidator with the default schemes and the given validation options.
 * @param options Validation options, built from the public constants declared in
 *        this class. Several options are combined by adding them together, e.g.
 *        <code>ALLOW_2_SLASHES + NO_FRAGMENTS</code> enables both of those options.
 */
public UrlValidator(long options) {
    this(null, null, options);
}

/**
 * Creates a UrlValidator for the given schemes and validation options.
 * @param schemes The set of valid schemes. Setting the ALLOW_ALL_SCHEMES option
 *        will ignore the contents of schemes.
 * @param options Validation options, built from the public constants declared in
 *        this class. Several options are combined by adding them together, e.g.
 *        <code>ALLOW_2_SLASHES + NO_FRAGMENTS</code> enables both of those options.
 */
public UrlValidator(String[] schemes, long options) {
    this(schemes, null, options);
}

/**
 * Creates a UrlValidator with the default schemes, a custom authority validator
 * and the given validation options.
 * @param authorityValidator Regular expression validator used to validate the authority part
 * @param options Validation options, built from the public constants declared in
 *        this class. Several options are combined by adding them together, e.g.
 *        <code>ALLOW_2_SLASHES + NO_FRAGMENTS</code> enables both of those options.
 */
public UrlValidator(RegexValidator authorityValidator, long options) {
    this(null, authorityValidator, options);
}
244
245 /**
246 * Customizable constructor. Validation behavior is modifed by passing in options.
247 * @param schemes the set of valid schemes
248 * @param authorityValidator Regular expression validator used to validate the authority part
249 * @param options Validation options. Set using the public constants of this class.
250 * To set multiple options, simply add them together:
251 * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
252 * enables both of those options.
253 */
254 public UrlValidator(String[] schemes, RegexValidator authorityValidator, long options) {
255 this.options = options;
256
257 if (isOn(ALLOW_ALL_SCHEMES)) {
258 this.allowedSchemes = Collections.emptySet();
259 } else {
260 if (schemes == null) {
261 schemes = DEFAULT_SCHEMES;
262 }
263 this.allowedSchemes = new HashSet<>();
264 this.allowedSchemes.addAll(Arrays.asList(schemes));
265 }
266
267 this.authorityValidator = authorityValidator;
268
269 }
270
271 /**
272 * <p>Checks if a field has a valid url address.</p>
273 *
274 * @param value The value validation is being performed on. A <code>null</code>
275 * value is considered invalid.
276 * @return true if the url is valid.
277 */
278 @Override
279 public boolean isValid(String value) {
280 if (value == null) {
281 return false;
282 }
283
284 if (!ASCII_PATTERN.matcher(value).matches()) {
285 setErrorMessage(tr("URL contains non-ascii characters"));
286 setFix(value.replaceAll("[^\\p{ASCII}]+", ""));
287 return false;
288 }
289
290 // Check the whole url address structure
291 Matcher urlMatcher = URL_PATTERN.matcher(value);
292 if (!urlMatcher.matches()) {
293 setErrorMessage(tr("URL is invalid"));
294 return false;
295 }
296
297 String scheme = urlMatcher.group(PARSE_URL_SCHEME);
298 if (!isValidScheme(scheme)) {
299 setErrorMessage(tr("URL contains an invalid protocol: {0}", scheme));
300 return false;
301 }
302
303 String authority = urlMatcher.group(PARSE_URL_AUTHORITY);
304 if ("file".equals(scheme) && "".equals(authority)) {
305 // Special case - file: allows an empty authority
306 } else {
307 // Validate the authority
308 if (!isValidAuthority(authority)) {
309 setErrorMessage(tr("URL contains an invalid authority: {0}", authority));
310 return false;
311 }
312 }
313
314 String path = urlMatcher.group(PARSE_URL_PATH);
315 if (!isValidPath(path)) {
316 setErrorMessage(tr("URL contains an invalid path: {0}", path));
317 return false;
318 }
319
320 String query = urlMatcher.group(PARSE_URL_QUERY);
321 if (!isValidQuery(query)) {
322 setErrorMessage(tr("URL contains an invalid query: {0}", query));
323 return false;
324 }
325
326 String fragment = urlMatcher.group(PARSE_URL_FRAGMENT);
327 if (!isValidFragment(fragment)) {
328 setErrorMessage(tr("URL contains an invalid fragment: {0}", fragment));
329 return false;
330 }
331
332 return true;
333 }
334
335 /**
336 * Validate scheme. If schemes[] was initialized to a non null,
337 * then only those scheme's are allowed. Note this is slightly different
338 * than for the constructor.
339 * @param scheme The scheme to validate. A <code>null</code> value is considered
340 * invalid.
341 * @return true if valid.
342 */
343 protected boolean isValidScheme(String scheme) {
344 if (scheme == null) {
345 return false;
346 }
347
348 if (!SCHEME_PATTERN.matcher(scheme).matches()) {
349 return false;
350 }
351
352 if (isOff(ALLOW_ALL_SCHEMES)) {
353
354 if (!this.allowedSchemes.contains(scheme)) {
355 return false;
356 }
357 }
358
359 return true;
360 }
361
362 /**
363 * Returns true if the authority is properly formatted. An authority is the combination
364 * of hostname and port. A <code>null</code> authority value is considered invalid.
365 * @param authority Authority value to validate.
366 * @return true if authority (hostname and port) is valid.
367 */
368 protected boolean isValidAuthority(String authority) {
369 if (authority == null) {
370 return false;
371 }
372
373 // check manual authority validation if specified
374 if (authorityValidator != null) {
375 if (authorityValidator.isValid(authority)) {
376 return true;
377 }
378 }
379
380 Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authority);
381 if (!authorityMatcher.matches()) {
382 return false;
383 }
384
385 String hostLocation = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
386 // check if authority is hostname or IP address:
387 // try a hostname first since that's much more likely
388 DomainValidator domainValidator = DomainValidator.getInstance(isOn(ALLOW_LOCAL_URLS));
389 if (!domainValidator.isValid(hostLocation)) {
390 // try an IP address
391 InetAddressValidator inetAddressValidator =
392 InetAddressValidator.getInstance();
393 if (!inetAddressValidator.isValid(hostLocation)) {
394 // isn't either one, so the URL is invalid
395 return false;
396 }
397 }
398
399 String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
400 if (port != null) {
401 if (!PORT_PATTERN.matcher(port).matches()) {
402 return false;
403 }
404 }
405
406 String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
407 if (extra != null && extra.trim().length() > 0){
408 return false;
409 }
410
411 return true;
412 }
413
414 /**
415 * Returns true if the path is valid. A <code>null</code> value is considered invalid.
416 * @param path Path value to validate.
417 * @return true if path is valid.
418 */
419 protected boolean isValidPath(String path) {
420 if (path == null) {
421 return false;
422 }
423
424 if (!PATH_PATTERN.matcher(path).matches()) {
425 return false;
426 }
427
428 int slash2Count = countToken("//", path);
429 if (isOff(ALLOW_2_SLASHES) && (slash2Count > 0)) {
430 return false;
431 }
432
433 int slashCount = countToken("/", path);
434 int dot2Count = countToken("..", path);
435 if (dot2Count > 0) {
436 if ((slashCount - slash2Count - 1) <= dot2Count) {
437 return false;
438 }
439 }
440
441 return true;
442 }
443
444 /**
445 * Returns true if the query is null or it's a properly formatted query string.
446 * @param query Query value to validate.
447 * @return true if query is valid.
448 */
449 protected boolean isValidQuery(String query) {
450 if (query == null) {
451 return true;
452 }
453
454 return QUERY_PATTERN.matcher(query).matches();
455 }
456
457 /**
458 * Returns true if the given fragment is null or fragments are allowed.
459 * @param fragment Fragment value to validate.
460 * @return true if fragment is valid.
461 */
462 protected boolean isValidFragment(String fragment) {
463 if (fragment == null) {
464 return true;
465 }
466
467 return isOff(NO_FRAGMENTS);
468 }
469
470 /**
471 * Returns the number of times the token appears in the target.
472 * @param token Token value to be counted.
473 * @param target Target value to count tokens in.
474 * @return the number of tokens.
475 */
476 protected int countToken(String token, String target) {
477 int tokenIndex = 0;
478 int count = 0;
479 while (tokenIndex != -1) {
480 tokenIndex = target.indexOf(token, tokenIndex);
481 if (tokenIndex > -1) {
482 tokenIndex++;
483 count++;
484 }
485 }
486 return count;
487 }
488
489 /**
490 * Tests whether the given flag is on. If the flag is not a power of 2
491 * (ie. 3) this tests whether the combination of flags is on.
492 *
493 * @param flag Flag value to check.
494 *
495 * @return whether the specified flag value is on.
496 */
497 private boolean isOn(long flag) {
498 return (this.options & flag) > 0;
499 }
500
501 /**
502 * Tests whether the given flag is off. If the flag is not a power of 2
503 * (ie. 3) this tests whether the combination of flags is off.
504 *
505 * @param flag Flag value to check.
506 *
507 * @return whether the specified flag value is off.
508 */
509 private boolean isOff(long flag) {
510 return (this.options & flag) == 0;
511 }
512}
Note: See TracBrowser for help on using the repository browser.