source: josm/trunk/test/unit/org/openstreetmap/josm/data/validation/routines/DomainValidatorTestIT.java@ 10756

Last change on this file since 10756 was 10756, checked in by Don-vip, 8 years ago

add robustness to DomainValidatorTestIT

  • Property svn:eol-style set to native
File size: 17.1 KB
Line 
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17package org.openstreetmap.josm.data.validation.routines;
18
19import static org.junit.Assert.assertTrue;
20import static org.junit.Assert.fail;
21
22import java.io.BufferedReader;
23import java.io.Closeable;
24import java.io.File;
25import java.io.FileInputStream;
26import java.io.FileOutputStream;
27import java.io.IOException;
28import java.io.InputStream;
29import java.io.InputStreamReader;
30import java.lang.reflect.Field;
31import java.lang.reflect.Modifier;
32import java.net.ConnectException;
33import java.net.HttpURLConnection;
34import java.net.IDN;
35import java.net.URL;
36import java.nio.charset.StandardCharsets;
37import java.text.SimpleDateFormat;
38import java.util.Date;
39import java.util.HashMap;
40import java.util.HashSet;
41import java.util.Iterator;
42import java.util.Locale;
43import java.util.Map;
44import java.util.Set;
45import java.util.TreeMap;
46import java.util.regex.Matcher;
47import java.util.regex.Pattern;
48
49import org.junit.Test;
50import org.openstreetmap.josm.Main;
51
52import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
53
54/**
55 * Integration tests for the DomainValidator.
56 *
57 * @version $Revision: 1723861 $
58 */
59public class DomainValidatorTestIT {
60
61 /**
62 * Download and process local copy of http://data.iana.org/TLD/tlds-alpha-by-domain.txt
63 * Check if the internal TLD table is up to date
64 * Check if the internal TLD tables have any spurious entries
65 * @throws Exception if an error occurs
66 */
67 @Test
68 public void testIanaTldList() throws Exception {
69 // Check the arrays first as this affects later checks
70 // Doing this here makes it easier when updating the lists
71 boolean OK = true;
72 for (String list : new String[]{"INFRASTRUCTURE_TLDS", "COUNTRY_CODE_TLDS", "GENERIC_TLDS", "LOCAL_TLDS"}) {
73 OK &= isSortedLowerCase(list);
74 }
75 if (!OK) {
76 System.out.println("Fix arrays before retrying; cannot continue");
77 return;
78 }
79 Set<String> ianaTlds = new HashSet<>(); // keep for comparison with array contents
80 DomainValidator dv = DomainValidator.getInstance();
81 File txtFile = new File(System.getProperty("java.io.tmpdir"), "tlds-alpha-by-domain.txt");
82 long timestamp;
83 try {
84 timestamp = download(txtFile, "http://data.iana.org/TLD/tlds-alpha-by-domain.txt", 0L);
85 } catch (ConnectException e) {
86 Main.error(e);
87 // Try again one more time in case of random network issue
88 timestamp = download(txtFile, "http://data.iana.org/TLD/tlds-alpha-by-domain.txt", 0L);
89 }
90 final File htmlFile = new File(System.getProperty("java.io.tmpdir"), "tlds-alpha-by-domain.html");
91 // N.B. sometimes the html file may be updated a day or so after the txt file
92 // if the txt file contains entries not found in the html file, try again in a day or two
93 download(htmlFile, "http://www.iana.org/domains/root/db", timestamp);
94
95 try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(txtFile), StandardCharsets.UTF_8))) {
96 String line;
97 final String header;
98 line = br.readLine(); // header
99 if (line != null && line.startsWith("# Version ")) {
100 header = line.substring(2);
101 } else {
102 throw new IOException("File does not have expected Version header");
103 }
104 final boolean generateUnicodeTlds = false; // Change this to generate Unicode TLDs as well
105
106 // Parse html page to get entries
107 Map<String, String[]> htmlInfo = getHtmlInfo(htmlFile);
108 Map<String, String> missingTLD = new TreeMap<>(); // stores entry and comments as String[]
109 Map<String, String> missingCC = new TreeMap<>();
110 while ((line = br.readLine()) != null) {
111 if (!line.startsWith("#")) {
112 final String unicodeTld; // only different from asciiTld if that was punycode
113 final String asciiTld = line.toLowerCase(Locale.ENGLISH);
114 if (line.startsWith("XN--")) {
115 unicodeTld = IDN.toUnicode(line);
116 } else {
117 unicodeTld = asciiTld;
118 }
119 if (!dv.isValidTld(asciiTld)) {
120 String[] info = htmlInfo.get(asciiTld);
121 if (info != null) {
122 String type = info[0];
123 String comment = info[1];
124 if ("country-code".equals(type)) { // Which list to use?
125 missingCC.put(asciiTld, unicodeTld + " " + comment);
126 if (generateUnicodeTlds) {
127 missingCC.put(unicodeTld, asciiTld + " " + comment);
128 }
129 } else {
130 missingTLD.put(asciiTld, unicodeTld + " " + comment);
131 if (generateUnicodeTlds) {
132 missingTLD.put(unicodeTld, asciiTld + " " + comment);
133 }
134 }
135 } else {
136 System.err.println("Expected to find HTML info for "+ asciiTld);
137 }
138 }
139 ianaTlds.add(asciiTld);
140 // Don't merge these conditions; generateUnicodeTlds is final so needs to be separate to avoid a warning
141 if (generateUnicodeTlds) {
142 if (!unicodeTld.equals(asciiTld)) {
143 ianaTlds.add(unicodeTld);
144 }
145 }
146 }
147 }
148 // List html entries not in TLD text list
149 for (String key : (new TreeMap<>(htmlInfo)).keySet()) {
150 if (!ianaTlds.contains(key)) {
151 if (isNotInRootZone(key)) {
152 System.out.println("INFO: HTML entry not yet in root zone: "+key);
153 } else {
154 System.err.println("WARN: Expected to find text entry for html: "+key);
155 }
156 }
157 }
158 if (!missingTLD.isEmpty()) {
159 printMap(header, missingTLD, "TLD");
160 fail("missing TLD");
161 }
162 if (!missingCC.isEmpty()) {
163 printMap(header, missingCC, "CC");
164 fail("missing CC");
165 }
166 }
167 // Check if internal tables contain any additional entries
168 assertTrue(isInIanaList("INFRASTRUCTURE_TLDS", ianaTlds));
169 assertTrue(isInIanaList("COUNTRY_CODE_TLDS", ianaTlds));
170 assertTrue(isInIanaList("GENERIC_TLDS", ianaTlds));
171 // Don't check local TLDS assertTrue(isInIanaList("LOCAL_TLDS", ianaTlds));
172 }
173
174 private static void printMap(final String header, Map<String, String> map, String string) {
175 System.out.println("Entries missing from "+ string +" List\n");
176 if (header != null) {
177 System.out.println(" // Taken from " + header);
178 }
179 Iterator<Map.Entry<String, String>> it = map.entrySet().iterator();
180 while (it.hasNext()) {
181 Map.Entry<String, String> me = it.next();
182 System.out.println(" \"" + me.getKey() + "\", // " + me.getValue());
183 }
184 System.out.println("\nDone");
185 }
186
187 @SuppressFBWarnings(value = "PERFORMANCE")
188 private static Map<String, String[]> getHtmlInfo(final File f) throws IOException {
189 final Map<String, String[]> info = new HashMap<>();
190
191// <td><span class="domain tld"><a href="/domains/root/db/ax.html">.ax</a></span></td>
192 final Pattern domain = Pattern.compile(".*<a href=\"/domains/root/db/([^.]+)\\.html");
193// <td>country-code</td>
194 final Pattern type = Pattern.compile("\\s+<td>([^<]+)</td>");
195// <!-- <td>Åland Islands<br/><span class="tld-table-so">Ålands landskapsregering</span></td> </td> -->
196// <td>Ålands landskapsregering</td>
197 final Pattern comment = Pattern.compile("\\s+<td>([^<]+)</td>");
198
199 try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), StandardCharsets.UTF_8))) {
200 String line;
201 while ((line = br.readLine()) != null) {
202 Matcher m = domain.matcher(line);
203 if (m.lookingAt()) {
204 String dom = m.group(1);
205 String typ = "??";
206 String com = "??";
207 line = br.readLine();
208 while (line != null && line.matches("^\\s*$")) { // extra blank lines introduced
209 line = br.readLine();
210 }
211 Matcher t = type.matcher(line);
212 if (t.lookingAt()) {
213 typ = t.group(1);
214 line = br.readLine();
215 if (line != null && line.matches("\\s+<!--.*")) {
216 while (line != null && !line.matches(".*-->.*")) {
217 line = br.readLine();
218 }
219 line = br.readLine();
220 }
221 // Should have comment; is it wrapped?
222 while (line != null && !line.matches(".*</td>.*")) {
223 line += " " +br.readLine();
224 }
225 Matcher n = comment.matcher(line);
226 if (n.lookingAt()) {
227 com = n.group(1);
228 }
229 // Don't save unused entries
230 if (com.contains("Not assigned") || com.contains("Retired") || typ.equals("test")) {
231 // System.out.println("Ignored: " + typ + " " + dom + " " +com);
232 } else {
233 info.put(dom.toLowerCase(Locale.ENGLISH), new String[]{typ, com});
234 // System.out.println("Storing: " + typ + " " + dom + " " +com);
235 }
236 } else {
237 System.err.println("Unexpected type: " + line);
238 }
239 }
240 }
241 }
242 return info;
243 }
244
245 /*
246 * Download a file if it is more recent than our cached copy.
247 * Unfortunately the server does not seem to honour If-Modified-Since for the
248 * Html page, so we check if it is newer than the txt file and skip download if so
249 */
250 private static long download(File f, String tldurl, long timestamp) throws IOException {
251 final int HOUR = 60*60*1000; // an hour in ms
252 final long modTime;
253 // For testing purposes, don't download files more than once an hour
254 if (f.canRead()) {
255 modTime = f.lastModified();
256 if (modTime > System.currentTimeMillis()-HOUR) {
257 System.out.println("Skipping download - found recent " + f);
258 return modTime;
259 }
260 } else {
261 modTime = 0;
262 }
263 HttpURLConnection hc = (HttpURLConnection) new URL(tldurl).openConnection();
264 if (modTime > 0) {
265 SimpleDateFormat sdf = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z"); //Sun, 06 Nov 1994 08:49:37 GMT
266 String since = sdf.format(new Date(modTime));
267 hc.addRequestProperty("If-Modified-Since", since);
268 System.out.println("Found " + f + " with date " + since);
269 }
270 if (hc.getResponseCode() == 304) {
271 System.out.println("Already have most recent " + tldurl);
272 } else {
273 System.out.println("Downloading " + tldurl);
274 byte[] buff = new byte[1024];
275 try (InputStream is = hc.getInputStream();
276 FileOutputStream fos = new FileOutputStream(f)) {
277 int len;
278 while ((len = is.read(buff)) != -1) {
279 fos.write(buff, 0, len);
280 }
281 }
282 System.out.println("Done");
283 }
284 return f.lastModified();
285 }
286
287 /**
288 * Check whether the domain is in the root zone currently.
289 * Reads the URL http://www.iana.org/domains/root/db/*domain*.html
290 * (using a local disk cache)
291 * and checks for the string "This domain is not present in the root zone at this time."
292 * @param domain the domain to check
293 * @return true if the string is found
294 */
295 private static boolean isNotInRootZone(String domain) {
296 String tldurl = "http://www.iana.org/domains/root/db/" + domain + ".html";
297 BufferedReader in = null;
298 try {
299 File rootCheck = new File(System.getProperty("java.io.tmpdir"), "tld_" + domain + ".html");
300 download(rootCheck, tldurl, 0L);
301 in = new BufferedReader(new InputStreamReader(new FileInputStream(rootCheck), StandardCharsets.UTF_8));
302 String inputLine;
303 while ((inputLine = in.readLine()) != null) {
304 if (inputLine.contains("This domain is not present in the root zone at this time.")) {
305 return true;
306 }
307 }
308 in.close();
309 } catch (IOException e) {
310 e.printStackTrace();
311 } finally {
312 closeQuietly(in);
313 }
314 return false;
315 }
316
317 private static void closeQuietly(Closeable in) {
318 if (in != null) {
319 try {
320 in.close();
321 } catch (IOException e) {
322 e.printStackTrace();
323 }
324 }
325 }
326
// isInIanaList and isSortedLowerCase are each split into two methods:
// one that fetches the array by reflection and one that does the actual check.
// If/when access to the arrays is possible without reflection,
// the intermediate methods can be dropped
330 private static boolean isInIanaList(String arrayName, Set<String> ianaTlds) throws Exception {
331 Field f = DomainValidator.class.getDeclaredField(arrayName);
332 final boolean isPrivate = Modifier.isPrivate(f.getModifiers());
333 if (isPrivate) {
334 f.setAccessible(true);
335 }
336 String[] array = (String[]) f.get(null);
337 try {
338 return isInIanaList(arrayName, array, ianaTlds);
339 } finally {
340 if (isPrivate) {
341 f.setAccessible(false);
342 }
343 }
344 }
345
346 private static boolean isInIanaList(String name, String[] array, Set<String> ianaTlds) {
347 boolean ok = true;
348 for (int i = 0; i < array.length; i++) {
349 if (!ianaTlds.contains(array[i])) {
350 System.out.println(name + " contains unexpected value: " + array[i]);
351 ok = false;
352 }
353 }
354 return ok;
355 }
356
357 private static boolean isSortedLowerCase(String arrayName) throws Exception {
358 Field f = DomainValidator.class.getDeclaredField(arrayName);
359 final boolean isPrivate = Modifier.isPrivate(f.getModifiers());
360 if (isPrivate) {
361 f.setAccessible(true);
362 }
363 String[] array = (String[]) f.get(null);
364 try {
365 return isSortedLowerCase(arrayName, array);
366 } finally {
367 if (isPrivate) {
368 f.setAccessible(false);
369 }
370 }
371 }
372
373 private static boolean isLowerCase(String string) {
374 return string.equals(string.toLowerCase(Locale.ENGLISH));
375 }
376
377 // Check if an array is strictly sorted - and lowerCase
378 private static boolean isSortedLowerCase(String name, String[] array) {
379 boolean sorted = true;
380 boolean strictlySorted = true;
381 final int length = array.length;
382 boolean lowerCase = isLowerCase(array[length-1]); // Check the last entry
383 for (int i = 0; i < length-1; i++) { // compare all but last entry with next
384 final String entry = array[i];
385 final String nextEntry = array[i+1];
386 final int cmp = entry.compareTo(nextEntry);
387 if (cmp > 0) { // out of order
388 System.out.println("Out of order entry: " + entry + " < " + nextEntry + " in " + name);
389 sorted = false;
390 } else if (cmp == 0) {
391 strictlySorted = false;
392 System.out.println("Duplicated entry: " + entry + " in " + name);
393 }
394 if (!isLowerCase(entry)) {
395 System.out.println("Non lowerCase entry: " + entry + " in " + name);
396 lowerCase = false;
397 }
398 }
399 return sorted && strictlySorted && lowerCase;
400 }
401}
Note: See TracBrowser for help on using the repository browser.