source: josm/trunk/src/org/openstreetmap/josm/data/validation/tests/SimilarNamedWays.java@ 14501

Last change on this file since 14501 was 14373, checked in by simon04, 5 years ago

see #15889 - deprecate SimilarNamedWays.getLevenshteinDistance

  • Property svn:eol-style set to native
File size: 9.5 KB
Line 
1// License: GPL. For details, see LICENSE file.
2package org.openstreetmap.josm.data.validation.tests;
3
4import static java.util.regex.Pattern.CASE_INSENSITIVE;
5import static java.util.regex.Pattern.UNICODE_CASE;
6import static org.openstreetmap.josm.tools.I18n.tr;
7
8import java.awt.geom.Point2D;
9import java.util.ArrayList;
10import java.util.Arrays;
11import java.util.HashMap;
12import java.util.List;
13import java.util.Locale;
14import java.util.Map;
15import java.util.regex.Matcher;
16import java.util.regex.Pattern;
17
18import org.openstreetmap.josm.data.osm.OsmPrimitive;
19import org.openstreetmap.josm.data.osm.Way;
20import org.openstreetmap.josm.data.validation.Severity;
21import org.openstreetmap.josm.data.validation.Test;
22import org.openstreetmap.josm.data.validation.TestError;
23import org.openstreetmap.josm.data.validation.util.ValUtil;
24import org.openstreetmap.josm.gui.progress.ProgressMonitor;
25import org.openstreetmap.josm.tools.MultiMap;
26import org.openstreetmap.josm.tools.Utils;
27
28/**
29 * Checks for similar named ways, symptom of a possible typo. It uses the
30 * Levenshtein distance to check for similarity
31 *
32 * @author frsantos
33 */
34public class SimilarNamedWays extends Test {
35
36 protected static final int SIMILAR_NAMED = 701;
37
38 /** All ways, grouped by cells */
39 private Map<Point2D, List<Way>> cellWays;
40 /** The already detected errors */
41 private MultiMap<Way, Way> errorWays;
42
43 private final List<NormalizeRule> rules = new ArrayList<>();
44
45 /**
46 * Constructor
47 */
48 public SimilarNamedWays() {
49 super(tr("Similarly named ways"),
50 tr("This test checks for ways with similar names that may have been misspelled."));
51
52 // FIXME: hardcode these rules for now. Replace them with preferences later
53 // See https://josm.openstreetmap.de/ticket/3733#comment:19
54 addRegExprRule("\\pN+", "0"); // Unicode numbers: matches "Highway 66" but also persian numbers
55 addRegExprRule("M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "0"); // Roman numbers: matches "Building II"
56 addRegExprRule("\\d+(st|nd|rd|th)", "0st"); // 3rd Ave
57 addRegExprRule("^[A-Z] ", "X"); // E Street
58 addSynonyms("east", "west", "north", "south");
59 addSynonyms("first", "second", "third");
60 }
61
62 @Override
63 public void startTest(ProgressMonitor monitor) {
64 super.startTest(monitor);
65 cellWays = new HashMap<>(1000);
66 errorWays = new MultiMap<>();
67 }
68
69 @Override
70 public void endTest() {
71 cellWays = null;
72 errorWays = null;
73 super.endTest();
74 }
75
76 @Override
77 public void visit(Way w) {
78 if (!w.isUsable())
79 return;
80
81 String name = w.get("name");
82 if (name == null || name.length() < 6)
83 return;
84
85 List<List<Way>> theCellWays = ValUtil.getWaysInCell(w, cellWays);
86 for (List<Way> ways : theCellWays) {
87 for (Way w2 : ways) {
88 if (errorWays.contains(w, w2) || errorWays.contains(w2, w)) {
89 continue;
90 }
91
92 String name2 = w2.get("name");
93 if (name2 == null || name2.length() < 6) {
94 continue;
95 }
96
97 if (similaryName(name, name2)) {
98 List<OsmPrimitive> primitives = new ArrayList<>(2);
99 primitives.add(w);
100 primitives.add(w2);
101 errors.add(TestError.builder(this, Severity.WARNING, SIMILAR_NAMED)
102 .message(tr("Similarly named ways"))
103 .primitives(primitives)
104 .build());
105 errorWays.put(w, w2);
106 }
107 }
108 ways.add(w);
109 }
110 }
111
112 /**
113 * Compute Levenshtein distance
114 *
115 * @param s First word
116 * @param t Second word
117 * @return The distance between words
118 * @deprecated Use {@link Utils#getLevenshteinDistance} instead
119 */
120 @Deprecated
121 public static int getLevenshteinDistance(String s, String t) {
122 return Utils.getLevenshteinDistance(s, t);
123 }
124
125 /**
126 * Add a regular expression rule.
127 * @param regExpr the regular expression to search for
128 * @param replacement a string to replace with, which should match the expression.
129 */
130 public void addRegExprRule(String regExpr, String replacement) {
131 rules.add(new RegExprRule(regExpr, replacement));
132 }
133
134 /**
135 * Add a rule with synonym words.
136 * @param words words which are synonyms
137 */
138 public void addSynonyms(String... words) {
139 for (String word : words) {
140 rules.add(new SynonymRule(word, words));
141 }
142 }
143
144 /**
145 * Check if two names are similar, but not identical. First both names will be "normalized".
146 * Afterwards the Levenshtein distance will be calculated.<br>
147 * Examples for normalization rules:<br>
148 * <code>replaceAll("\\d+", "0")</code><br>
149 * would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true
150 * @param name first name to compare
151 * @param name2 second name to compare
152 * @return true if the normalized names are different but only a "little bit"
153 */
154 public boolean similaryName(String name, String name2) {
155 boolean similar = Utils.isSimilar(name, name2);
156
157 // try all rules
158 for (NormalizeRule rule : rules) {
159 int levenshteinDistance = Utils.getLevenshteinDistance(rule.normalize(name), rule.normalize(name2));
160 if (levenshteinDistance == 0)
161 // one rule results in identical names: identical
162 return false;
163 else if (levenshteinDistance <= 2) {
164 // 0 < distance <= 2
165 similar = true;
166 }
167 }
168 return similar;
169 }
170
171 /**
172 * A normalization that is applied to names before testing them
173 */
174 @FunctionalInterface
175 public interface NormalizeRule {
176
177 /**
178 * Normalize the string by replacing parts.
179 * @param name name to normalize
180 * @return normalized string
181 */
182 String normalize(String name);
183 }
184
185 /**
186 * A rule to replace by regular expression,
187 * so that all strings matching the regular expression are handled as if they were {@link RegExprRule#replacement}
188 */
189 public static class RegExprRule implements NormalizeRule {
190 private final Pattern regExpr;
191 private final String replacement;
192
193 /**
194 * Create a new rule to replace by regular expression
195 * @param expression The regular expression
196 * @param replacement The replacement
197 */
198 public RegExprRule(String expression, String replacement) {
199 this.regExpr = Pattern.compile(expression);
200 this.replacement = replacement;
201 }
202
203 @Override
204 public String normalize(String name) {
205 return regExpr.matcher(name).replaceAll(replacement);
206 }
207
208 @Override
209 public String toString() {
210 return "replaceAll(" + regExpr + ", " + replacement + ')';
211 }
212 }
213
214 /**
215 * A rule that registers synonyms to a given word
216 */
217 public static class SynonymRule implements NormalizeRule {
218
219 private final String[] words;
220 private final Pattern regExpr;
221 private final String replacement;
222
223 /**
224 * Create a new {@link SynonymRule}
225 * @param replacement The word to use instead
226 * @param words The synonyms for that word
227 */
228 public SynonymRule(String replacement, String... words) {
229 this.replacement = replacement.toLowerCase(Locale.ENGLISH);
230 this.words = words;
231
232 // build regular expression for other words (for fast match)
233 StringBuilder expression = new StringBuilder();
234 int maxLength = 0;
235 for (int i = 0; i < words.length; i++) {
236 if (words[i].length() > maxLength) {
237 maxLength = words[i].length();
238 }
239 if (expression.length() > 0) {
240 expression.append('|');
241 }
242 expression.append(Pattern.quote(words[i]));
243 }
244 this.regExpr = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE);
245 }
246
247 @Override
248 public String normalize(String name) {
249 // find first match
250 Matcher matcher = regExpr.matcher(name);
251 if (!matcher.find())
252 return name;
253
254 int start = matcher.start();
255
256 // which word matches?
257 String part = "";
258 for (int i = 0; i < words.length; i++) {
259 String word = words[i];
260 part = name.substring(start, start + word.length());
261 if (word.equalsIgnoreCase(part)) {
262 break;
263 }
264 }
265
266 // replace the word
267 char[] newName = matcher.replaceFirst(replacement).toCharArray();
268
269 // adjust case (replacement is not shorter than matching word!)
270 int minLength = Math.min(replacement.length(), part.length());
271 for (int i = 0; i < minLength; i++) {
272 if (Character.isUpperCase(part.charAt(i))) {
273 newName[start + i] = Character.toUpperCase(newName[start + i]);
274 }
275 }
276
277 return new String(newName);
278 }
279
280 @Override
281 public String toString() {
282 return "synonyms(" + replacement + ", " + Arrays.toString(words) + ')';
283 }
284 }
285}
Note: See TracBrowser for help on using the repository browser.