source: josm/trunk/src/org/openstreetmap/josm/data/validation/tests/SimilarNamedWays.java@ 7848

Last change on this file since 7848 was 7848, checked in by Don-vip, 9 years ago

fix #3733 - SimilarNamedWays naïvely uses Levenshtein distance and marks a lot of false positives (patch by mdk, brianegge, modified). Rules still need to be stored in JOSM preferences instead of current hardcoding.

  • Property svn:eol-style set to native
File size: 9.6 KB
Line 
1// License: GPL. See LICENSE file for details.
2package org.openstreetmap.josm.data.validation.tests;
3
4import static java.util.regex.Pattern.CASE_INSENSITIVE;
5import static java.util.regex.Pattern.UNICODE_CASE;
6import static org.openstreetmap.josm.tools.I18n.tr;
7
8import java.awt.geom.Point2D;
9import java.util.ArrayList;
10import java.util.Arrays;
11import java.util.HashMap;
12import java.util.List;
13import java.util.Map;
14import java.util.regex.Matcher;
15import java.util.regex.Pattern;
16
17import org.openstreetmap.josm.data.osm.OsmPrimitive;
18import org.openstreetmap.josm.data.osm.Way;
19import org.openstreetmap.josm.data.validation.Severity;
20import org.openstreetmap.josm.data.validation.Test;
21import org.openstreetmap.josm.data.validation.TestError;
22import org.openstreetmap.josm.data.validation.util.ValUtil;
23import org.openstreetmap.josm.gui.progress.ProgressMonitor;
24import org.openstreetmap.josm.tools.MultiMap;
25import org.openstreetmap.josm.tools.Utils;
26
27/**
28 * Checks for similar named ways, symptom of a possible typo. It uses the
29 * Levenshtein distance to check for similarity
30 *
31 * @author frsantos
32 */
33public class SimilarNamedWays extends Test {
34
35 protected static final int SIMILAR_NAMED = 701;
36
37 /** All ways, grouped by cells */
38 private Map<Point2D,List<Way>> cellWays;
39 /** The already detected errors */
40 private MultiMap<Way, Way> errorWays;
41
42 private ArrayList<NormalizeRule> rules = new ArrayList<NormalizeRule>();
43
44 /**
45 * Constructor
46 */
47 public SimilarNamedWays() {
48 super(tr("Similarly named ways"),
49 tr("This test checks for ways with similar names that may have been misspelled."));
50
51 // FIXME: hardcode these rules for now. Replace them with preferences later
52 // See https://josm.openstreetmap.de/ticket/3733#comment:19
53 addRegExprRule("\\d+", "0"); // Highway 66
54 addRegExprRule("\\d+(st|nd|rd|th)", "0st"); // 3rd Ave
55 addRegExprRule("^[A-Z] ", "X"); // E Street
56 addSynonyms("east", "west", "north", "south");
57 addSynonyms("first", "second", "third");
58 }
59
60 @Override
61 public void startTest(ProgressMonitor monitor) {
62 super.startTest(monitor);
63 cellWays = new HashMap<>(1000);
64 errorWays = new MultiMap<>();
65 }
66
67 @Override
68 public void endTest() {
69 cellWays = null;
70 errorWays = null;
71 super.endTest();
72 }
73
74 @Override
75 public void visit(Way w) {
76 if (!w.isUsable())
77 return;
78
79 String name = w.get("name");
80 if (name == null || name.length() < 6)
81 return;
82
83 List<List<Way>> theCellWays = ValUtil.getWaysInCell(w, cellWays);
84 for (List<Way> ways : theCellWays) {
85 for (Way w2 : ways) {
86 if (errorWays.contains(w, w2) || errorWays.contains(w2, w)) {
87 continue;
88 }
89
90 String name2 = w2.get("name");
91 if (name2 == null || name2.length() < 6) {
92 continue;
93 }
94
95 if (similaryName(name, name2)) {
96 List<OsmPrimitive> primitives = new ArrayList<>(2);
97 primitives.add(w);
98 primitives.add(w2);
99 errors.add(new TestError(this, Severity.WARNING, tr("Similarly named ways"), SIMILAR_NAMED, primitives));
100 errorWays.put(w, w2);
101 }
102 }
103 ways.add(w);
104 }
105 }
106
107 /**
108 * Compute Levenshtein distance
109 *
110 * @param s First word
111 * @param t Second word
112 * @return The distance between words
113 */
114 public static int getLevenshteinDistance(String s, String t) {
115 int[][] d; // matrix
116 int n; // length of s
117 int m; // length of t
118 int i; // iterates through s
119 int j; // iterates through t
120 char s_i; // ith character of s
121 char t_j; // jth character of t
122 int cost; // cost
123
124 // Step 1
125 n = s.length();
126 m = t.length();
127 if (n == 0)
128 return m;
129 if (m == 0)
130 return n;
131 d = new int[n + 1][m + 1];
132
133 // Step 2
134 for (i = 0; i <= n; i++) {
135 d[i][0] = i;
136 }
137 for (j = 0; j <= m; j++) {
138 d[0][j] = j;
139 }
140
141 // Step 3
142 for (i = 1; i <= n; i++) {
143
144 s_i = s.charAt(i - 1);
145
146 // Step 4
147 for (j = 1; j <= m; j++) {
148
149 t_j = t.charAt(j - 1);
150
151 // Step 5
152 if (s_i == t_j) {
153 cost = 0;
154 } else {
155 cost = 1;
156 }
157
158 // Step 6
159 d[i][j] = Utils.min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
160 }
161 }
162
163 // Step 7
164 return d[n][m];
165 }
166
167 /**
168 * Add a regular expression rule.
169 * @param regExpr the regular expression to search for
170 * @param replacement a string to replace with, which should match the expression.
171 */
172 public void addRegExprRule(String regExpr, String replacement) {
173 rules.add(new RegExprRule(regExpr, replacement));
174 }
175
176 /**
177 * Add a rule with synonym words.
178 * @param words words which are synonyms
179 */
180 public void addSynonyms(String... words) {
181 for (String word : words) {
182 rules.add(new SynonymRule(word, words));
183 }
184 }
185
186 /**
187 * Check if two names are similar, but not identical. First both names will be "normalized".
188 * Afterwards the Levenshtein distance will be calculated.<br>
189 * Examples for normalization rules:<br>
190 * <code>replaceAll("\\d+", "0")</code><br>
191 * would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true
192 * @param name first name to compare
193 * @param name2 second name to compare
194 * @return true if the normalized names are different but only a "little bit"
195 */
196 public boolean similaryName(String name, String name2) {
197 // check plain strings
198 int distance = getLevenshteinDistance(name, name2);
199 boolean similar = distance>0 && distance<=2;
200
201 // try all rules
202 for (NormalizeRule rule : rules) {
203 int levenshteinDistance = getLevenshteinDistance(rule.normalize(name), rule.normalize(name2));
204 if (levenshteinDistance == 0)
205 // one rule results in identical names: identical
206 return false;
207 else if (levenshteinDistance <= 2) {
208 // 0 < distance <= 2
209 similar = true;
210 }
211 }
212 return similar;
213 }
214
215 public interface NormalizeRule {
216
217 /**
218 * Normalize the string by replacing parts.
219 * @param name name to normalize
220 * @return normalized string
221 */
222 String normalize(String name);
223 }
224
225 public class RegExprRule implements NormalizeRule {
226 private final Pattern regExpr;
227 private final String replacement;
228
229 public RegExprRule(String expression, String replacement) {
230 this.regExpr = Pattern.compile(expression);
231 this.replacement = replacement;
232 }
233
234 @Override
235 public String normalize(String name) {
236 return regExpr.matcher(name).replaceAll(replacement);
237 }
238
239 @Override
240 public String toString() {
241 return "replaceAll(" + regExpr + ", " + replacement + ")";
242 }
243 }
244
245 public class SynonymRule implements NormalizeRule {
246
247 private final String[] words;
248 private final Pattern regExpr;
249 private final String replacement;
250
251 public SynonymRule(String replacement, String[] words) {
252 this.replacement = replacement.toLowerCase();
253 this.words = words;
254
255 // build regular expression for other words (for fast match)
256 StringBuilder expression = new StringBuilder();
257 int maxLength = 0;
258 for (int i = 0; i < words.length; i++) {
259 if (words[i].length() > maxLength) {
260 maxLength = words[i].length();
261 }
262 if (expression.length() > 0) {
263 expression.append("|");
264 }
265 expression.append(Pattern.quote(words[i]));
266 }
267 this.regExpr = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE);
268 }
269
270 @Override
271 public String normalize(String name) {
272 // find first match
273 Matcher matcher = regExpr.matcher(name);
274 if (!matcher.find())
275 return name;
276
277 int start = matcher.start();
278
279 // which word matches?
280 String part = "";
281 for (int i = 0; i < words.length; i++) {
282 String word = words[i];
283 part = name.substring(start, start + word.length());
284 if (word.equalsIgnoreCase(part)) {
285 break;
286 }
287 }
288
289 // replace the word
290 char[] newName = matcher.replaceFirst(replacement).toCharArray();
291
292 // adjust case (replacement is not shorter than matching word!)
293 int minLength = Math.min(replacement.length(), part.length());
294 for (int i = 0; i < minLength; i++) {
295 if (Character.isUpperCase(part.charAt(i))) {
296 newName[start + i] = Character.toUpperCase(newName[start + i]);
297 }
298 }
299
300 return new String(newName);
301 }
302
303 @Override
304 public String toString() {
305 return "synonyms(" + replacement + ", " + Arrays.toString(words) + ")";
306 }
307 }
308}
Note: See TracBrowser for help on using the repository browser.