/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.openstreetmap.josm.data.validation.routines;

import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.net.ConnectException;
import java.net.HttpURLConnection;
import java.net.IDN;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TimeZone;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.junit.Test;
import org.openstreetmap.josm.Main;

import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;

54 | /**
|
---|
55 | * Integration tests for the DomainValidator.
|
---|
56 | *
|
---|
57 | * @version $Revision: 1723861 $
|
---|
58 | */
|
---|
59 | public class DomainValidatorTestIT {
|
---|
60 |
|
---|
61 | /**
|
---|
62 | * Download and process local copy of http://data.iana.org/TLD/tlds-alpha-by-domain.txt
|
---|
63 | * Check if the internal TLD table is up to date
|
---|
64 | * Check if the internal TLD tables have any spurious entries
|
---|
65 | * @throws Exception if an error occurs
|
---|
66 | */
|
---|
67 | @Test
|
---|
68 | public void testIanaTldList() throws Exception {
|
---|
69 | // Check the arrays first as this affects later checks
|
---|
70 | // Doing this here makes it easier when updating the lists
|
---|
71 | boolean OK = true;
|
---|
72 | for (String list : new String[]{"INFRASTRUCTURE_TLDS", "COUNTRY_CODE_TLDS", "GENERIC_TLDS", "LOCAL_TLDS"}) {
|
---|
73 | OK &= isSortedLowerCase(list);
|
---|
74 | }
|
---|
75 | if (!OK) {
|
---|
76 | System.out.println("Fix arrays before retrying; cannot continue");
|
---|
77 | return;
|
---|
78 | }
|
---|
79 | Set<String> ianaTlds = new HashSet<>(); // keep for comparison with array contents
|
---|
80 | DomainValidator dv = DomainValidator.getInstance();
|
---|
81 | File txtFile = new File(System.getProperty("java.io.tmpdir"), "tlds-alpha-by-domain.txt");
|
---|
82 | long timestamp;
|
---|
83 | try {
|
---|
84 | timestamp = download(txtFile, "http://data.iana.org/TLD/tlds-alpha-by-domain.txt", 0L);
|
---|
85 | } catch (ConnectException e) {
|
---|
86 | Main.error(e);
|
---|
87 | // Try again one more time in case of random network issue
|
---|
88 | timestamp = download(txtFile, "http://data.iana.org/TLD/tlds-alpha-by-domain.txt", 0L);
|
---|
89 | }
|
---|
90 | final File htmlFile = new File(System.getProperty("java.io.tmpdir"), "tlds-alpha-by-domain.html");
|
---|
91 | // N.B. sometimes the html file may be updated a day or so after the txt file
|
---|
92 | // if the txt file contains entries not found in the html file, try again in a day or two
|
---|
93 | download(htmlFile, "http://www.iana.org/domains/root/db", timestamp);
|
---|
94 |
|
---|
95 | try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(txtFile), StandardCharsets.UTF_8))) {
|
---|
96 | String line;
|
---|
97 | final String header;
|
---|
98 | line = br.readLine(); // header
|
---|
99 | if (line != null && line.startsWith("# Version ")) {
|
---|
100 | header = line.substring(2);
|
---|
101 | } else {
|
---|
102 | throw new IOException("File does not have expected Version header");
|
---|
103 | }
|
---|
104 | final boolean generateUnicodeTlds = false; // Change this to generate Unicode TLDs as well
|
---|
105 |
|
---|
106 | // Parse html page to get entries
|
---|
107 | Map<String, String[]> htmlInfo = getHtmlInfo(htmlFile);
|
---|
108 | Map<String, String> missingTLD = new TreeMap<>(); // stores entry and comments as String[]
|
---|
109 | Map<String, String> missingCC = new TreeMap<>();
|
---|
110 | while ((line = br.readLine()) != null) {
|
---|
111 | if (!line.startsWith("#")) {
|
---|
112 | final String unicodeTld; // only different from asciiTld if that was punycode
|
---|
113 | final String asciiTld = line.toLowerCase(Locale.ENGLISH);
|
---|
114 | if (line.startsWith("XN--")) {
|
---|
115 | unicodeTld = IDN.toUnicode(line);
|
---|
116 | } else {
|
---|
117 | unicodeTld = asciiTld;
|
---|
118 | }
|
---|
119 | if (!dv.isValidTld(asciiTld)) {
|
---|
120 | String[] info = htmlInfo.get(asciiTld);
|
---|
121 | if (info != null) {
|
---|
122 | String type = info[0];
|
---|
123 | String comment = info[1];
|
---|
124 | if ("country-code".equals(type)) { // Which list to use?
|
---|
125 | missingCC.put(asciiTld, unicodeTld + " " + comment);
|
---|
126 | if (generateUnicodeTlds) {
|
---|
127 | missingCC.put(unicodeTld, asciiTld + " " + comment);
|
---|
128 | }
|
---|
129 | } else {
|
---|
130 | missingTLD.put(asciiTld, unicodeTld + " " + comment);
|
---|
131 | if (generateUnicodeTlds) {
|
---|
132 | missingTLD.put(unicodeTld, asciiTld + " " + comment);
|
---|
133 | }
|
---|
134 | }
|
---|
135 | } else {
|
---|
136 | System.err.println("Expected to find HTML info for "+ asciiTld);
|
---|
137 | }
|
---|
138 | }
|
---|
139 | ianaTlds.add(asciiTld);
|
---|
140 | // Don't merge these conditions; generateUnicodeTlds is final so needs to be separate to avoid a warning
|
---|
141 | if (generateUnicodeTlds) {
|
---|
142 | if (!unicodeTld.equals(asciiTld)) {
|
---|
143 | ianaTlds.add(unicodeTld);
|
---|
144 | }
|
---|
145 | }
|
---|
146 | }
|
---|
147 | }
|
---|
148 | // List html entries not in TLD text list
|
---|
149 | for (String key : (new TreeMap<>(htmlInfo)).keySet()) {
|
---|
150 | if (!ianaTlds.contains(key)) {
|
---|
151 | if (isNotInRootZone(key)) {
|
---|
152 | System.out.println("INFO: HTML entry not yet in root zone: "+key);
|
---|
153 | } else {
|
---|
154 | System.err.println("WARN: Expected to find text entry for html: "+key);
|
---|
155 | }
|
---|
156 | }
|
---|
157 | }
|
---|
158 | if (!missingTLD.isEmpty()) {
|
---|
159 | printMap(header, missingTLD, "TLD");
|
---|
160 | fail("missing TLD");
|
---|
161 | }
|
---|
162 | if (!missingCC.isEmpty()) {
|
---|
163 | printMap(header, missingCC, "CC");
|
---|
164 | fail("missing CC");
|
---|
165 | }
|
---|
166 | }
|
---|
167 | // Check if internal tables contain any additional entries
|
---|
168 | assertTrue(isInIanaList("INFRASTRUCTURE_TLDS", ianaTlds));
|
---|
169 | assertTrue(isInIanaList("COUNTRY_CODE_TLDS", ianaTlds));
|
---|
170 | assertTrue(isInIanaList("GENERIC_TLDS", ianaTlds));
|
---|
171 | // Don't check local TLDS assertTrue(isInIanaList("LOCAL_TLDS", ianaTlds));
|
---|
172 | }
|
---|
173 |
|
---|
174 | private static void printMap(final String header, Map<String, String> map, String string) {
|
---|
175 | System.out.println("Entries missing from "+ string +" List\n");
|
---|
176 | if (header != null) {
|
---|
177 | System.out.println(" // Taken from " + header);
|
---|
178 | }
|
---|
179 | Iterator<Map.Entry<String, String>> it = map.entrySet().iterator();
|
---|
180 | while (it.hasNext()) {
|
---|
181 | Map.Entry<String, String> me = it.next();
|
---|
182 | System.out.println(" \"" + me.getKey() + "\", // " + me.getValue());
|
---|
183 | }
|
---|
184 | System.out.println("\nDone");
|
---|
185 | }
|
---|
186 |
|
---|
187 | @SuppressFBWarnings(value = "PERFORMANCE")
|
---|
188 | private static Map<String, String[]> getHtmlInfo(final File f) throws IOException {
|
---|
189 | final Map<String, String[]> info = new HashMap<>();
|
---|
190 |
|
---|
191 | // <td><span class="domain tld"><a href="/domains/root/db/ax.html">.ax</a></span></td>
|
---|
192 | final Pattern domain = Pattern.compile(".*<a href=\"/domains/root/db/([^.]+)\\.html");
|
---|
193 | // <td>country-code</td>
|
---|
194 | final Pattern type = Pattern.compile("\\s+<td>([^<]+)</td>");
|
---|
195 | // <!-- <td>Åland Islands<br/><span class="tld-table-so">Ålands landskapsregering</span></td> </td> -->
|
---|
196 | // <td>Ålands landskapsregering</td>
|
---|
197 | final Pattern comment = Pattern.compile("\\s+<td>([^<]+)</td>");
|
---|
198 |
|
---|
199 | try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), StandardCharsets.UTF_8))) {
|
---|
200 | String line;
|
---|
201 | while ((line = br.readLine()) != null) {
|
---|
202 | Matcher m = domain.matcher(line);
|
---|
203 | if (m.lookingAt()) {
|
---|
204 | String dom = m.group(1);
|
---|
205 | String typ = "??";
|
---|
206 | String com = "??";
|
---|
207 | line = br.readLine();
|
---|
208 | while (line != null && line.matches("^\\s*$")) { // extra blank lines introduced
|
---|
209 | line = br.readLine();
|
---|
210 | }
|
---|
211 | Matcher t = type.matcher(line);
|
---|
212 | if (t.lookingAt()) {
|
---|
213 | typ = t.group(1);
|
---|
214 | line = br.readLine();
|
---|
215 | if (line != null && line.matches("\\s+<!--.*")) {
|
---|
216 | while (line != null && !line.matches(".*-->.*")) {
|
---|
217 | line = br.readLine();
|
---|
218 | }
|
---|
219 | line = br.readLine();
|
---|
220 | }
|
---|
221 | // Should have comment; is it wrapped?
|
---|
222 | while (line != null && !line.matches(".*</td>.*")) {
|
---|
223 | line += " " +br.readLine();
|
---|
224 | }
|
---|
225 | Matcher n = comment.matcher(line);
|
---|
226 | if (n.lookingAt()) {
|
---|
227 | com = n.group(1);
|
---|
228 | }
|
---|
229 | // Don't save unused entries
|
---|
230 | if (com.contains("Not assigned") || com.contains("Retired") || typ.equals("test")) {
|
---|
231 | // System.out.println("Ignored: " + typ + " " + dom + " " +com);
|
---|
232 | } else {
|
---|
233 | info.put(dom.toLowerCase(Locale.ENGLISH), new String[]{typ, com});
|
---|
234 | // System.out.println("Storing: " + typ + " " + dom + " " +com);
|
---|
235 | }
|
---|
236 | } else {
|
---|
237 | System.err.println("Unexpected type: " + line);
|
---|
238 | }
|
---|
239 | }
|
---|
240 | }
|
---|
241 | }
|
---|
242 | return info;
|
---|
243 | }
|
---|
244 |
|
---|
245 | /*
|
---|
246 | * Download a file if it is more recent than our cached copy.
|
---|
247 | * Unfortunately the server does not seem to honour If-Modified-Since for the
|
---|
248 | * Html page, so we check if it is newer than the txt file and skip download if so
|
---|
249 | */
|
---|
250 | private static long download(File f, String tldurl, long timestamp) throws IOException {
|
---|
251 | final int HOUR = 60*60*1000; // an hour in ms
|
---|
252 | final long modTime;
|
---|
253 | // For testing purposes, don't download files more than once an hour
|
---|
254 | if (f.canRead()) {
|
---|
255 | modTime = f.lastModified();
|
---|
256 | if (modTime > System.currentTimeMillis()-HOUR) {
|
---|
257 | System.out.println("Skipping download - found recent " + f);
|
---|
258 | return modTime;
|
---|
259 | }
|
---|
260 | } else {
|
---|
261 | modTime = 0;
|
---|
262 | }
|
---|
263 | HttpURLConnection hc = (HttpURLConnection) new URL(tldurl).openConnection();
|
---|
264 | if (modTime > 0) {
|
---|
265 | SimpleDateFormat sdf = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z"); //Sun, 06 Nov 1994 08:49:37 GMT
|
---|
266 | String since = sdf.format(new Date(modTime));
|
---|
267 | hc.addRequestProperty("If-Modified-Since", since);
|
---|
268 | System.out.println("Found " + f + " with date " + since);
|
---|
269 | }
|
---|
270 | if (hc.getResponseCode() == 304) {
|
---|
271 | System.out.println("Already have most recent " + tldurl);
|
---|
272 | } else {
|
---|
273 | System.out.println("Downloading " + tldurl);
|
---|
274 | byte[] buff = new byte[1024];
|
---|
275 | try (InputStream is = hc.getInputStream();
|
---|
276 | FileOutputStream fos = new FileOutputStream(f)) {
|
---|
277 | int len;
|
---|
278 | while ((len = is.read(buff)) != -1) {
|
---|
279 | fos.write(buff, 0, len);
|
---|
280 | }
|
---|
281 | }
|
---|
282 | System.out.println("Done");
|
---|
283 | }
|
---|
284 | return f.lastModified();
|
---|
285 | }
|
---|
286 |
|
---|
287 | /**
|
---|
288 | * Check whether the domain is in the root zone currently.
|
---|
289 | * Reads the URL http://www.iana.org/domains/root/db/*domain*.html
|
---|
290 | * (using a local disk cache)
|
---|
291 | * and checks for the string "This domain is not present in the root zone at this time."
|
---|
292 | * @param domain the domain to check
|
---|
293 | * @return true if the string is found
|
---|
294 | */
|
---|
295 | private static boolean isNotInRootZone(String domain) {
|
---|
296 | String tldurl = "http://www.iana.org/domains/root/db/" + domain + ".html";
|
---|
297 | BufferedReader in = null;
|
---|
298 | try {
|
---|
299 | File rootCheck = new File(System.getProperty("java.io.tmpdir"), "tld_" + domain + ".html");
|
---|
300 | download(rootCheck, tldurl, 0L);
|
---|
301 | in = new BufferedReader(new InputStreamReader(new FileInputStream(rootCheck), StandardCharsets.UTF_8));
|
---|
302 | String inputLine;
|
---|
303 | while ((inputLine = in.readLine()) != null) {
|
---|
304 | if (inputLine.contains("This domain is not present in the root zone at this time.")) {
|
---|
305 | return true;
|
---|
306 | }
|
---|
307 | }
|
---|
308 | in.close();
|
---|
309 | } catch (IOException e) {
|
---|
310 | e.printStackTrace();
|
---|
311 | } finally {
|
---|
312 | closeQuietly(in);
|
---|
313 | }
|
---|
314 | return false;
|
---|
315 | }
|
---|
316 |
|
---|
317 | private static void closeQuietly(Closeable in) {
|
---|
318 | if (in != null) {
|
---|
319 | try {
|
---|
320 | in.close();
|
---|
321 | } catch (IOException e) {
|
---|
322 | e.printStackTrace();
|
---|
323 | }
|
---|
324 | }
|
---|
325 | }
|
---|
326 |
|
---|
327 | // isInIanaList and isSorted are split into two methods.
|
---|
328 | // If/when access to the arrays is possible without reflection, the intermediate
|
---|
329 | // methods can be dropped
|
---|
330 | private static boolean isInIanaList(String arrayName, Set<String> ianaTlds) throws Exception {
|
---|
331 | Field f = DomainValidator.class.getDeclaredField(arrayName);
|
---|
332 | final boolean isPrivate = Modifier.isPrivate(f.getModifiers());
|
---|
333 | if (isPrivate) {
|
---|
334 | f.setAccessible(true);
|
---|
335 | }
|
---|
336 | String[] array = (String[]) f.get(null);
|
---|
337 | try {
|
---|
338 | return isInIanaList(arrayName, array, ianaTlds);
|
---|
339 | } finally {
|
---|
340 | if (isPrivate) {
|
---|
341 | f.setAccessible(false);
|
---|
342 | }
|
---|
343 | }
|
---|
344 | }
|
---|
345 |
|
---|
346 | private static boolean isInIanaList(String name, String[] array, Set<String> ianaTlds) {
|
---|
347 | boolean ok = true;
|
---|
348 | for (int i = 0; i < array.length; i++) {
|
---|
349 | if (!ianaTlds.contains(array[i])) {
|
---|
350 | System.out.println(name + " contains unexpected value: " + array[i]);
|
---|
351 | ok = false;
|
---|
352 | }
|
---|
353 | }
|
---|
354 | return ok;
|
---|
355 | }
|
---|
356 |
|
---|
357 | private static boolean isSortedLowerCase(String arrayName) throws Exception {
|
---|
358 | Field f = DomainValidator.class.getDeclaredField(arrayName);
|
---|
359 | final boolean isPrivate = Modifier.isPrivate(f.getModifiers());
|
---|
360 | if (isPrivate) {
|
---|
361 | f.setAccessible(true);
|
---|
362 | }
|
---|
363 | String[] array = (String[]) f.get(null);
|
---|
364 | try {
|
---|
365 | return isSortedLowerCase(arrayName, array);
|
---|
366 | } finally {
|
---|
367 | if (isPrivate) {
|
---|
368 | f.setAccessible(false);
|
---|
369 | }
|
---|
370 | }
|
---|
371 | }
|
---|
372 |
|
---|
373 | private static boolean isLowerCase(String string) {
|
---|
374 | return string.equals(string.toLowerCase(Locale.ENGLISH));
|
---|
375 | }
|
---|
376 |
|
---|
377 | // Check if an array is strictly sorted - and lowerCase
|
---|
378 | private static boolean isSortedLowerCase(String name, String[] array) {
|
---|
379 | boolean sorted = true;
|
---|
380 | boolean strictlySorted = true;
|
---|
381 | final int length = array.length;
|
---|
382 | boolean lowerCase = isLowerCase(array[length-1]); // Check the last entry
|
---|
383 | for (int i = 0; i < length-1; i++) { // compare all but last entry with next
|
---|
384 | final String entry = array[i];
|
---|
385 | final String nextEntry = array[i+1];
|
---|
386 | final int cmp = entry.compareTo(nextEntry);
|
---|
387 | if (cmp > 0) { // out of order
|
---|
388 | System.out.println("Out of order entry: " + entry + " < " + nextEntry + " in " + name);
|
---|
389 | sorted = false;
|
---|
390 | } else if (cmp == 0) {
|
---|
391 | strictlySorted = false;
|
---|
392 | System.out.println("Duplicated entry: " + entry + " in " + name);
|
---|
393 | }
|
---|
394 | if (!isLowerCase(entry)) {
|
---|
395 | System.out.println("Non lowerCase entry: " + entry + " in " + name);
|
---|
396 | lowerCase = false;
|
---|
397 | }
|
---|
398 | }
|
---|
399 | return sorted && strictlySorted && lowerCase;
|
---|
400 | }
|
---|
401 | }
|
---|