1 | /*
|
---|
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
|
---|
3 | * contributor license agreements. See the NOTICE file distributed with
|
---|
4 | * this work for additional information regarding copyright ownership.
|
---|
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
|
---|
6 | * (the "License"); you may not use this file except in compliance with
|
---|
7 | * the License. You may obtain a copy of the License at
|
---|
8 | *
|
---|
9 | * http://www.apache.org/licenses/LICENSE-2.0
|
---|
10 | *
|
---|
11 | * Unless required by applicable law or agreed to in writing, software
|
---|
12 | * distributed under the License is distributed on an "AS IS" BASIS,
|
---|
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
---|
14 | * See the License for the specific language governing permissions and
|
---|
15 | * limitations under the License.
|
---|
16 | */
|
---|
17 | package org.openstreetmap.josm.data.validation.routines;
|
---|
18 |
|
---|
19 | import static org.junit.jupiter.api.Assertions.assertTrue;
|
---|
20 | import static org.junit.jupiter.api.Assertions.fail;
|
---|
21 |
|
---|
22 | import java.io.BufferedReader;
|
---|
23 | import java.io.Closeable;
|
---|
24 | import java.io.File;
|
---|
25 | import java.io.FileInputStream;
|
---|
26 | import java.io.FileOutputStream;
|
---|
27 | import java.io.IOException;
|
---|
28 | import java.io.InputStream;
|
---|
29 | import java.io.InputStreamReader;
|
---|
30 | import java.lang.reflect.Field;
|
---|
31 | import java.lang.reflect.Modifier;
|
---|
32 | import java.net.ConnectException;
|
---|
33 | import java.net.HttpURLConnection;
|
---|
34 | import java.net.IDN;
|
---|
35 | import java.net.URL;
|
---|
36 | import java.nio.charset.StandardCharsets;
|
---|
37 | import java.text.SimpleDateFormat;
|
---|
38 | import java.util.Date;
|
---|
39 | import java.util.HashMap;
|
---|
40 | import java.util.HashSet;
|
---|
41 | import java.util.Iterator;
|
---|
42 | import java.util.Locale;
|
---|
43 | import java.util.Map;
|
---|
44 | import java.util.Set;
|
---|
45 | import java.util.TreeMap;
|
---|
46 | import java.util.regex.Matcher;
|
---|
47 | import java.util.regex.Pattern;
|
---|
48 |
|
---|
49 | import org.junit.jupiter.api.extension.RegisterExtension;
|
---|
50 | import org.junit.jupiter.api.Test;
|
---|
51 | import org.openstreetmap.josm.testutils.JOSMTestRules;
|
---|
52 | import org.openstreetmap.josm.tools.Logging;
|
---|
53 |
|
---|
54 | import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
|
---|
55 |
|
---|
56 | /**
|
---|
57 | * Integration tests for the DomainValidator.
|
---|
58 | *
|
---|
59 | * @version $Revision: 1723861 $
|
---|
60 | */
|
---|
61 | class DomainValidatorTestIT {
|
---|
62 |
|
---|
63 | /**
|
---|
64 | * Setup rule
|
---|
65 | */
|
---|
66 | @RegisterExtension
|
---|
67 | @SuppressFBWarnings(value = "URF_UNREAD_PUBLIC_OR_PROTECTED_FIELD")
|
---|
68 | public JOSMTestRules test = new JOSMTestRules().https();
|
---|
69 |
|
---|
70 | /**
|
---|
71 | * Download and process local copy of http://data.iana.org/TLD/tlds-alpha-by-domain.txt
|
---|
72 | * Check if the internal TLD table is up to date
|
---|
73 | * Check if the internal TLD tables have any spurious entries
|
---|
74 | * @throws Exception if an error occurs
|
---|
75 | */
|
---|
76 | @Test
|
---|
77 | void testIanaTldList() throws Exception {
|
---|
78 | // Check the arrays first as this affects later checks
|
---|
79 | // Doing this here makes it easier when updating the lists
|
---|
80 | boolean OK = true;
|
---|
81 | for (String list : new String[]{"INFRASTRUCTURE_TLDS", "COUNTRY_CODE_TLDS", "GENERIC_TLDS", "LOCAL_TLDS"}) {
|
---|
82 | OK &= isSortedLowerCase(list);
|
---|
83 | }
|
---|
84 | if (!OK) {
|
---|
85 | System.out.println("Fix arrays before retrying; cannot continue");
|
---|
86 | return;
|
---|
87 | }
|
---|
88 | Set<String> ianaTlds = new HashSet<>(); // keep for comparison with array contents
|
---|
89 | DomainValidator dv = DomainValidator.getInstance();
|
---|
90 | File txtFile = new File(System.getProperty("java.io.tmpdir"), "tlds-alpha-by-domain.txt");
|
---|
91 | long timestamp;
|
---|
92 | try {
|
---|
93 | timestamp = download(txtFile, "http://data.iana.org/TLD/tlds-alpha-by-domain.txt", 0L);
|
---|
94 | } catch (ConnectException e) {
|
---|
95 | Logging.error(e);
|
---|
96 | // Try again one more time in case of random network issue
|
---|
97 | timestamp = download(txtFile, "http://data.iana.org/TLD/tlds-alpha-by-domain.txt", 0L);
|
---|
98 | }
|
---|
99 | final File htmlFile = new File(System.getProperty("java.io.tmpdir"), "tlds-alpha-by-domain.html");
|
---|
100 | // N.B. sometimes the html file may be updated a day or so after the txt file
|
---|
101 | // if the txt file contains entries not found in the html file, try again in a day or two
|
---|
102 | download(htmlFile, "http://www.iana.org/domains/root/db", timestamp);
|
---|
103 |
|
---|
104 | try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(txtFile), StandardCharsets.UTF_8))) {
|
---|
105 | String line;
|
---|
106 | final String header;
|
---|
107 | line = br.readLine(); // header
|
---|
108 | if (line != null && line.startsWith("# Version ")) {
|
---|
109 | header = line.substring(2);
|
---|
110 | } else {
|
---|
111 | throw new IOException("File does not have expected Version header");
|
---|
112 | }
|
---|
113 | final boolean generateUnicodeTlds = false; // Change this to generate Unicode TLDs as well
|
---|
114 |
|
---|
115 | // Parse html page to get entries
|
---|
116 | Map<String, String[]> htmlInfo = getHtmlInfo(htmlFile);
|
---|
117 | Map<String, String> missingTLD = new TreeMap<>(); // stores entry and comments as String[]
|
---|
118 | Map<String, String> missingCC = new TreeMap<>();
|
---|
119 | while ((line = br.readLine()) != null) {
|
---|
120 | if (!line.startsWith("#")) {
|
---|
121 | final String unicodeTld; // only different from asciiTld if that was punycode
|
---|
122 | final String asciiTld = line.toLowerCase(Locale.ENGLISH);
|
---|
123 | if (line.startsWith("XN--")) {
|
---|
124 | unicodeTld = IDN.toUnicode(line);
|
---|
125 | } else {
|
---|
126 | unicodeTld = asciiTld;
|
---|
127 | }
|
---|
128 | if (!dv.isValidTld(asciiTld)) {
|
---|
129 | String[] info = htmlInfo.get(asciiTld);
|
---|
130 | if (info != null) {
|
---|
131 | String type = info[0];
|
---|
132 | String comment = info[1];
|
---|
133 | if ("country-code".equals(type)) { // Which list to use?
|
---|
134 | missingCC.put(asciiTld, unicodeTld + " " + comment);
|
---|
135 | if (generateUnicodeTlds) {
|
---|
136 | missingCC.put(unicodeTld, asciiTld + " " + comment);
|
---|
137 | }
|
---|
138 | } else {
|
---|
139 | missingTLD.put(asciiTld, unicodeTld + " " + comment);
|
---|
140 | if (generateUnicodeTlds) {
|
---|
141 | missingTLD.put(unicodeTld, asciiTld + " " + comment);
|
---|
142 | }
|
---|
143 | }
|
---|
144 | } else {
|
---|
145 | Logging.error("Expected to find HTML info for "+ asciiTld);
|
---|
146 | }
|
---|
147 | }
|
---|
148 | ianaTlds.add(asciiTld);
|
---|
149 | // Don't merge these conditions; generateUnicodeTlds is final so needs to be separate to avoid a warning
|
---|
150 | if (generateUnicodeTlds) {
|
---|
151 | if (!unicodeTld.equals(asciiTld)) {
|
---|
152 | ianaTlds.add(unicodeTld);
|
---|
153 | }
|
---|
154 | }
|
---|
155 | }
|
---|
156 | }
|
---|
157 | // List html entries not in TLD text list
|
---|
158 | for (String key : (new TreeMap<>(htmlInfo)).keySet()) {
|
---|
159 | if (!ianaTlds.contains(key)) {
|
---|
160 | if (isNotInRootZone(key)) {
|
---|
161 | Logging.info("HTML entry not yet in root zone: "+key);
|
---|
162 | } else {
|
---|
163 | Logging.warn("Expected to find text entry for html: "+key);
|
---|
164 | }
|
---|
165 | }
|
---|
166 | }
|
---|
167 | if (!missingTLD.isEmpty()) {
|
---|
168 | printMap(header, missingTLD, "TLD");
|
---|
169 | fail("missing TLD");
|
---|
170 | }
|
---|
171 | if (!missingCC.isEmpty()) {
|
---|
172 | printMap(header, missingCC, "CC");
|
---|
173 | fail("missing CC");
|
---|
174 | }
|
---|
175 | }
|
---|
176 | // Check if internal tables contain any additional entries
|
---|
177 | assertTrue(isInIanaList("INFRASTRUCTURE_TLDS", ianaTlds), String.join(System.lineSeparator(), Logging.getLastErrorAndWarnings()));
|
---|
178 | assertTrue(isInIanaList("COUNTRY_CODE_TLDS", ianaTlds), String.join(System.lineSeparator(), Logging.getLastErrorAndWarnings()));
|
---|
179 | assertTrue(isInIanaList("GENERIC_TLDS", ianaTlds), String.join(System.lineSeparator(), Logging.getLastErrorAndWarnings()));
|
---|
180 | // Don't check local TLDS assertTrue(isInIanaList("LOCAL_TLDS", ianaTlds));
|
---|
181 | }
|
---|
182 |
|
---|
183 | private static void printMap(final String header, Map<String, String> map, String string) {
|
---|
184 | Logging.warn("Entries missing from "+ string +" List\n");
|
---|
185 | if (header != null) {
|
---|
186 | Logging.warn(" // Taken from " + header);
|
---|
187 | }
|
---|
188 | Iterator<Map.Entry<String, String>> it = map.entrySet().iterator();
|
---|
189 | while (it.hasNext()) {
|
---|
190 | Map.Entry<String, String> me = it.next();
|
---|
191 | Logging.warn(" \"" + me.getKey() + "\", // " + me.getValue());
|
---|
192 | }
|
---|
193 | Logging.warn(System.lineSeparator() + "Done");
|
---|
194 | }
|
---|
195 |
|
---|
196 | @SuppressFBWarnings(value = "PERFORMANCE")
|
---|
197 | private static Map<String, String[]> getHtmlInfo(final File f) throws IOException {
|
---|
198 | final Map<String, String[]> info = new HashMap<>();
|
---|
199 |
|
---|
200 | final Pattern domain = Pattern.compile(".*<a href=\"/domains/root/db/([^.]+)\\.html");
|
---|
201 | final Pattern type = Pattern.compile("\\s+<td>([^<]+)</td>");
|
---|
202 | final Pattern comment = Pattern.compile("\\s+<td>([^<]+)</td>");
|
---|
203 |
|
---|
204 | try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), StandardCharsets.UTF_8))) {
|
---|
205 | String line;
|
---|
206 | while ((line = br.readLine()) != null) {
|
---|
207 | Matcher m = domain.matcher(line);
|
---|
208 | if (m.lookingAt()) {
|
---|
209 | String dom = m.group(1);
|
---|
210 | String typ = "??";
|
---|
211 | String com = "??";
|
---|
212 | line = br.readLine();
|
---|
213 | while (line != null && line.matches("^\\s*$")) { // extra blank lines introduced
|
---|
214 | line = br.readLine();
|
---|
215 | }
|
---|
216 | Matcher t = type.matcher(line);
|
---|
217 | if (t.lookingAt()) {
|
---|
218 | typ = t.group(1);
|
---|
219 | line = br.readLine();
|
---|
220 | if (line != null && line.matches("\\s+<!--.*")) {
|
---|
221 | while (line != null && !line.matches(".*-->.*")) {
|
---|
222 | line = br.readLine();
|
---|
223 | }
|
---|
224 | line = br.readLine();
|
---|
225 | }
|
---|
226 | // Should have comment; is it wrapped?
|
---|
227 | while (line != null && !line.matches(".*</td>.*")) {
|
---|
228 | line += " " +br.readLine();
|
---|
229 | }
|
---|
230 | Matcher n = comment.matcher(line);
|
---|
231 | if (n.lookingAt()) {
|
---|
232 | com = n.group(1);
|
---|
233 | }
|
---|
234 | // Don't save unused entries
|
---|
235 | if (!com.contains("Not assigned") && !com.contains("Retired") && !typ.equals("test")) {
|
---|
236 | info.put(dom.toLowerCase(Locale.ENGLISH), new String[]{typ, com});
|
---|
237 | }
|
---|
238 | } else {
|
---|
239 | Logging.error("Unexpected type: " + line);
|
---|
240 | }
|
---|
241 | }
|
---|
242 | }
|
---|
243 | }
|
---|
244 | return info;
|
---|
245 | }
|
---|
246 |
|
---|
247 | /*
|
---|
248 | * Download a file if it is more recent than our cached copy.
|
---|
249 | * Unfortunately the server does not seem to honour If-Modified-Since for the
|
---|
250 | * Html page, so we check if it is newer than the txt file and skip download if so
|
---|
251 | */
|
---|
252 | private static long download(File f, String tldurl, long timestamp) throws IOException {
|
---|
253 | final int HOUR = 60*60*1000; // an hour in ms
|
---|
254 | final long modTime;
|
---|
255 | // For testing purposes, don't download files more than once an hour
|
---|
256 | if (f.canRead()) {
|
---|
257 | modTime = f.lastModified();
|
---|
258 | if (modTime > System.currentTimeMillis()-HOUR) {
|
---|
259 | Logging.debug("Skipping download - found recent " + f);
|
---|
260 | return modTime;
|
---|
261 | }
|
---|
262 | } else {
|
---|
263 | modTime = 0;
|
---|
264 | }
|
---|
265 | HttpURLConnection hc = (HttpURLConnection) new URL(tldurl).openConnection();
|
---|
266 | if (modTime > 0) {
|
---|
267 | SimpleDateFormat sdf = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z"); //Sun, 06 Nov 1994 08:49:37 GMT
|
---|
268 | String since = sdf.format(new Date(modTime));
|
---|
269 | hc.addRequestProperty("If-Modified-Since", since);
|
---|
270 | Logging.debug("Found " + f + " with date " + since);
|
---|
271 | }
|
---|
272 | if (hc.getResponseCode() == 304) {
|
---|
273 | Logging.debug("Already have most recent " + tldurl);
|
---|
274 | } else {
|
---|
275 | Logging.debug("Downloading " + tldurl);
|
---|
276 | byte[] buff = new byte[1024];
|
---|
277 | try (InputStream is = hc.getInputStream();
|
---|
278 | FileOutputStream fos = new FileOutputStream(f)) {
|
---|
279 | int len;
|
---|
280 | while ((len = is.read(buff)) != -1) {
|
---|
281 | fos.write(buff, 0, len);
|
---|
282 | }
|
---|
283 | }
|
---|
284 | Logging.debug("Done");
|
---|
285 | }
|
---|
286 | return f.lastModified();
|
---|
287 | }
|
---|
288 |
|
---|
289 | /**
|
---|
290 | * Check whether the domain is in the root zone currently.
|
---|
291 | * Reads the URL http://www.iana.org/domains/root/db/*domain*.html
|
---|
292 | * (using a local disk cache)
|
---|
293 | * and checks for the string "This domain is not present in the root zone at this time."
|
---|
294 | * @param domain the domain to check
|
---|
295 | * @return true if the string is found
|
---|
296 | */
|
---|
297 | private static boolean isNotInRootZone(String domain) {
|
---|
298 | String tldurl = "http://www.iana.org/domains/root/db/" + domain + ".html";
|
---|
299 | BufferedReader in = null;
|
---|
300 | try {
|
---|
301 | File rootCheck = new File(System.getProperty("java.io.tmpdir"), "tld_" + domain + ".html");
|
---|
302 | download(rootCheck, tldurl, 0L);
|
---|
303 | in = new BufferedReader(new InputStreamReader(new FileInputStream(rootCheck), StandardCharsets.UTF_8));
|
---|
304 | String inputLine;
|
---|
305 | while ((inputLine = in.readLine()) != null) {
|
---|
306 | if (inputLine.contains("This domain is not present in the root zone at this time.")) {
|
---|
307 | return true;
|
---|
308 | }
|
---|
309 | }
|
---|
310 | in.close();
|
---|
311 | } catch (IOException e) {
|
---|
312 | e.printStackTrace();
|
---|
313 | } finally {
|
---|
314 | closeQuietly(in);
|
---|
315 | }
|
---|
316 | return false;
|
---|
317 | }
|
---|
318 |
|
---|
319 | private static void closeQuietly(Closeable in) {
|
---|
320 | if (in != null) {
|
---|
321 | try {
|
---|
322 | in.close();
|
---|
323 | } catch (IOException e) {
|
---|
324 | e.printStackTrace();
|
---|
325 | }
|
---|
326 | }
|
---|
327 | }
|
---|
328 |
|
---|
329 | // isInIanaList and isSorted are split into two methods.
|
---|
330 | // If/when access to the arrays is possible without reflection, the intermediate
|
---|
331 | // methods can be dropped
|
---|
332 | private static boolean isInIanaList(String arrayName, Set<String> ianaTlds) throws Exception {
|
---|
333 | Field f = DomainValidator.class.getDeclaredField(arrayName);
|
---|
334 | final boolean isPrivate = Modifier.isPrivate(f.getModifiers());
|
---|
335 | if (isPrivate) {
|
---|
336 | f.setAccessible(true);
|
---|
337 | }
|
---|
338 | String[] array = (String[]) f.get(null);
|
---|
339 | try {
|
---|
340 | return isInIanaList(arrayName, array, ianaTlds);
|
---|
341 | } finally {
|
---|
342 | if (isPrivate) {
|
---|
343 | f.setAccessible(false);
|
---|
344 | }
|
---|
345 | }
|
---|
346 | }
|
---|
347 |
|
---|
348 | private static boolean isInIanaList(String name, String[] array, Set<String> ianaTlds) {
|
---|
349 | boolean ok = true;
|
---|
350 | for (int i = 0; i < array.length; i++) {
|
---|
351 | if (!ianaTlds.contains(array[i])) {
|
---|
352 | Logging.error(name + " contains unexpected value: " + array[i]);
|
---|
353 | ok = false;
|
---|
354 | }
|
---|
355 | }
|
---|
356 | return ok;
|
---|
357 | }
|
---|
358 |
|
---|
359 | private static boolean isSortedLowerCase(String arrayName) throws Exception {
|
---|
360 | Field f = DomainValidator.class.getDeclaredField(arrayName);
|
---|
361 | final boolean isPrivate = Modifier.isPrivate(f.getModifiers());
|
---|
362 | if (isPrivate) {
|
---|
363 | f.setAccessible(true);
|
---|
364 | }
|
---|
365 | String[] array = (String[]) f.get(null);
|
---|
366 | try {
|
---|
367 | return isSortedLowerCase(arrayName, array);
|
---|
368 | } finally {
|
---|
369 | if (isPrivate) {
|
---|
370 | f.setAccessible(false);
|
---|
371 | }
|
---|
372 | }
|
---|
373 | }
|
---|
374 |
|
---|
375 | private static boolean isLowerCase(String string) {
|
---|
376 | return string.equals(string.toLowerCase(Locale.ENGLISH));
|
---|
377 | }
|
---|
378 |
|
---|
379 | // Check if an array is strictly sorted - and lowerCase
|
---|
380 | private static boolean isSortedLowerCase(String name, String[] array) {
|
---|
381 | boolean sorted = true;
|
---|
382 | boolean strictlySorted = true;
|
---|
383 | final int length = array.length;
|
---|
384 | boolean lowerCase = isLowerCase(array[length-1]); // Check the last entry
|
---|
385 | for (int i = 0; i < length-1; i++) { // compare all but last entry with next
|
---|
386 | final String entry = array[i];
|
---|
387 | final String nextEntry = array[i+1];
|
---|
388 | final int cmp = entry.compareTo(nextEntry);
|
---|
389 | if (cmp > 0) { // out of order
|
---|
390 | Logging.error("Out of order entry: " + entry + " < " + nextEntry + " in " + name);
|
---|
391 | sorted = false;
|
---|
392 | } else if (cmp == 0) {
|
---|
393 | strictlySorted = false;
|
---|
394 | Logging.error("Duplicated entry: " + entry + " in " + name);
|
---|
395 | }
|
---|
396 | if (!isLowerCase(entry)) {
|
---|
397 | Logging.error("Non lowerCase entry: " + entry + " in " + name);
|
---|
398 | lowerCase = false;
|
---|
399 | }
|
---|
400 | }
|
---|
401 | return sorted && strictlySorted && lowerCase;
|
---|
402 | }
|
---|
403 | }
|
---|