source: josm/trunk/src/org/openstreetmap/josm/data/validation/tests/SimilarNamedWays.java@ 9070

Last change on this file since 9070 was 9070, checked in by Don-vip, 8 years ago

Sonar - squid:S2293 - The diamond operator ("<>") should be used

  • Property svn:eol-style set to native
File size: 9.6 KB
Line 
1// License: GPL. For details, see LICENSE file.
2package org.openstreetmap.josm.data.validation.tests;
3
4import static java.util.regex.Pattern.CASE_INSENSITIVE;
5import static java.util.regex.Pattern.UNICODE_CASE;
6import static org.openstreetmap.josm.tools.I18n.tr;
7
8import java.awt.geom.Point2D;
9import java.util.ArrayList;
10import java.util.Arrays;
11import java.util.HashMap;
12import java.util.List;
13import java.util.Locale;
14import java.util.Map;
15import java.util.regex.Matcher;
16import java.util.regex.Pattern;
17
18import org.openstreetmap.josm.data.osm.OsmPrimitive;
19import org.openstreetmap.josm.data.osm.Way;
20import org.openstreetmap.josm.data.validation.Severity;
21import org.openstreetmap.josm.data.validation.Test;
22import org.openstreetmap.josm.data.validation.TestError;
23import org.openstreetmap.josm.data.validation.util.ValUtil;
24import org.openstreetmap.josm.gui.progress.ProgressMonitor;
25import org.openstreetmap.josm.tools.MultiMap;
26import org.openstreetmap.josm.tools.Utils;
27
28/**
29 * Checks for similar named ways, symptom of a possible typo. It uses the
30 * Levenshtein distance to check for similarity
31 *
32 * @author frsantos
33 */
34public class SimilarNamedWays extends Test {
35
36 protected static final int SIMILAR_NAMED = 701;
37
38 /** All ways, grouped by cells */
39 private Map<Point2D, List<Way>> cellWays;
40 /** The already detected errors */
41 private MultiMap<Way, Way> errorWays;
42
43 private final List<NormalizeRule> rules = new ArrayList<>();
44
45 /**
46 * Constructor
47 */
48 public SimilarNamedWays() {
49 super(tr("Similarly named ways"),
50 tr("This test checks for ways with similar names that may have been misspelled."));
51
52 // FIXME: hardcode these rules for now. Replace them with preferences later
53 // See https://josm.openstreetmap.de/ticket/3733#comment:19
54 addRegExprRule("\\d+", "0"); // Highway 66
55 addRegExprRule("\\d+(st|nd|rd|th)", "0st"); // 3rd Ave
56 addRegExprRule("^[A-Z] ", "X"); // E Street
57 addSynonyms("east", "west", "north", "south");
58 addSynonyms("first", "second", "third");
59 }
60
61 @Override
62 public void startTest(ProgressMonitor monitor) {
63 super.startTest(monitor);
64 cellWays = new HashMap<>(1000);
65 errorWays = new MultiMap<>();
66 }
67
68 @Override
69 public void endTest() {
70 cellWays = null;
71 errorWays = null;
72 super.endTest();
73 }
74
75 @Override
76 public void visit(Way w) {
77 if (!w.isUsable())
78 return;
79
80 String name = w.get("name");
81 if (name == null || name.length() < 6)
82 return;
83
84 List<List<Way>> theCellWays = ValUtil.getWaysInCell(w, cellWays);
85 for (List<Way> ways : theCellWays) {
86 for (Way w2 : ways) {
87 if (errorWays.contains(w, w2) || errorWays.contains(w2, w)) {
88 continue;
89 }
90
91 String name2 = w2.get("name");
92 if (name2 == null || name2.length() < 6) {
93 continue;
94 }
95
96 if (similaryName(name, name2)) {
97 List<OsmPrimitive> primitives = new ArrayList<>(2);
98 primitives.add(w);
99 primitives.add(w2);
100 errors.add(new TestError(this, Severity.WARNING, tr("Similarly named ways"), SIMILAR_NAMED, primitives));
101 errorWays.put(w, w2);
102 }
103 }
104 ways.add(w);
105 }
106 }
107
108 /**
109 * Compute Levenshtein distance
110 *
111 * @param s First word
112 * @param t Second word
113 * @return The distance between words
114 */
115 public static int getLevenshteinDistance(String s, String t) {
116 int[][] d; // matrix
117 int n; // length of s
118 int m; // length of t
119 int i; // iterates through s
120 int j; // iterates through t
121 char s_i; // ith character of s
122 char t_j; // jth character of t
123 int cost; // cost
124
125 // Step 1
126 n = s.length();
127 m = t.length();
128 if (n == 0)
129 return m;
130 if (m == 0)
131 return n;
132 d = new int[n+1][m+1];
133
134 // Step 2
135 for (i = 0; i <= n; i++) {
136 d[i][0] = i;
137 }
138 for (j = 0; j <= m; j++) {
139 d[0][j] = j;
140 }
141
142 // Step 3
143 for (i = 1; i <= n; i++) {
144
145 s_i = s.charAt(i - 1);
146
147 // Step 4
148 for (j = 1; j <= m; j++) {
149
150 t_j = t.charAt(j - 1);
151
152 // Step 5
153 if (s_i == t_j) {
154 cost = 0;
155 } else {
156 cost = 1;
157 }
158
159 // Step 6
160 d[i][j] = Utils.min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
161 }
162 }
163
164 // Step 7
165 return d[n][m];
166 }
167
168 /**
169 * Add a regular expression rule.
170 * @param regExpr the regular expression to search for
171 * @param replacement a string to replace with, which should match the expression.
172 */
173 public void addRegExprRule(String regExpr, String replacement) {
174 rules.add(new RegExprRule(regExpr, replacement));
175 }
176
177 /**
178 * Add a rule with synonym words.
179 * @param words words which are synonyms
180 */
181 public void addSynonyms(String... words) {
182 for (String word : words) {
183 rules.add(new SynonymRule(word, words));
184 }
185 }
186
187 /**
188 * Check if two names are similar, but not identical. First both names will be "normalized".
189 * Afterwards the Levenshtein distance will be calculated.<br>
190 * Examples for normalization rules:<br>
191 * <code>replaceAll("\\d+", "0")</code><br>
192 * would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true
193 * @param name first name to compare
194 * @param name2 second name to compare
195 * @return true if the normalized names are different but only a "little bit"
196 */
197 public boolean similaryName(String name, String name2) {
198 // check plain strings
199 int distance = getLevenshteinDistance(name, name2);
200 boolean similar = distance > 0 && distance <= 2;
201
202 // try all rules
203 for (NormalizeRule rule : rules) {
204 int levenshteinDistance = getLevenshteinDistance(rule.normalize(name), rule.normalize(name2));
205 if (levenshteinDistance == 0)
206 // one rule results in identical names: identical
207 return false;
208 else if (levenshteinDistance <= 2) {
209 // 0 < distance <= 2
210 similar = true;
211 }
212 }
213 return similar;
214 }
215
216 public interface NormalizeRule {
217
218 /**
219 * Normalize the string by replacing parts.
220 * @param name name to normalize
221 * @return normalized string
222 */
223 String normalize(String name);
224 }
225
226 public static class RegExprRule implements NormalizeRule {
227 private final Pattern regExpr;
228 private final String replacement;
229
230 public RegExprRule(String expression, String replacement) {
231 this.regExpr = Pattern.compile(expression);
232 this.replacement = replacement;
233 }
234
235 @Override
236 public String normalize(String name) {
237 return regExpr.matcher(name).replaceAll(replacement);
238 }
239
240 @Override
241 public String toString() {
242 return "replaceAll(" + regExpr + ", " + replacement + ')';
243 }
244 }
245
246 public static class SynonymRule implements NormalizeRule {
247
248 private final String[] words;
249 private final Pattern regExpr;
250 private final String replacement;
251
252 public SynonymRule(String replacement, String[] words) {
253 this.replacement = replacement.toLowerCase(Locale.ENGLISH);
254 this.words = words;
255
256 // build regular expression for other words (for fast match)
257 StringBuilder expression = new StringBuilder();
258 int maxLength = 0;
259 for (int i = 0; i < words.length; i++) {
260 if (words[i].length() > maxLength) {
261 maxLength = words[i].length();
262 }
263 if (expression.length() > 0) {
264 expression.append('|');
265 }
266 expression.append(Pattern.quote(words[i]));
267 }
268 this.regExpr = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE);
269 }
270
271 @Override
272 public String normalize(String name) {
273 // find first match
274 Matcher matcher = regExpr.matcher(name);
275 if (!matcher.find())
276 return name;
277
278 int start = matcher.start();
279
280 // which word matches?
281 String part = "";
282 for (int i = 0; i < words.length; i++) {
283 String word = words[i];
284 part = name.substring(start, start + word.length());
285 if (word.equalsIgnoreCase(part)) {
286 break;
287 }
288 }
289
290 // replace the word
291 char[] newName = matcher.replaceFirst(replacement).toCharArray();
292
293 // adjust case (replacement is not shorter than matching word!)
294 int minLength = Math.min(replacement.length(), part.length());
295 for (int i = 0; i < minLength; i++) {
296 if (Character.isUpperCase(part.charAt(i))) {
297 newName[start + i] = Character.toUpperCase(newName[start + i]);
298 }
299 }
300
301 return new String(newName);
302 }
303
304 @Override
305 public String toString() {
306 return "synonyms(" + replacement + ", " + Arrays.toString(words) + ')';
307 }
308 }
309}
Note: See TracBrowser for help on using the repository browser.