Context Navigation

source: josm/trunk/src/org/openstreetmap/josm/data/validation/tests/SimilarNamedWays.java@ 7848

Last change on this file since 7848 was 7848, checked in by Don-vip, 9 years ago
fix #3733 - SimilarNamedWays naïvely uses Levenshtein distance and marks a lot of false positives (patch by mdk, brianegge, modified). Rules still need to be stored in JOSM preferences instead of current hardcoding.
Property svn:eol-style set to `native`
File size: 9.6 KB

Line
1	// License: GPL. See LICENSE file for details.
2	package org.openstreetmap.josm.data.validation.tests;
3
4	import static java.util.regex.Pattern.CASE_INSENSITIVE;
5	import static java.util.regex.Pattern.UNICODE_CASE;
6	import static org.openstreetmap.josm.tools.I18n.tr;
7
8	import java.awt.geom.Point2D;
9	import java.util.ArrayList;
10	import java.util.Arrays;
11	import java.util.HashMap;
12	import java.util.List;
13	import java.util.Map;
14	import java.util.regex.Matcher;
15	import java.util.regex.Pattern;
16
17	import org.openstreetmap.josm.data.osm.OsmPrimitive;
18	import org.openstreetmap.josm.data.osm.Way;
19	import org.openstreetmap.josm.data.validation.Severity;
20	import org.openstreetmap.josm.data.validation.Test;
21	import org.openstreetmap.josm.data.validation.TestError;
22	import org.openstreetmap.josm.data.validation.util.ValUtil;
23	import org.openstreetmap.josm.gui.progress.ProgressMonitor;
24	import org.openstreetmap.josm.tools.MultiMap;
25	import org.openstreetmap.josm.tools.Utils;
26
27	/**
28	* Checks for similar named ways, symptom of a possible typo. It uses the
29	* Levenshtein distance to check for similarity
30	*
31	* @author frsantos
32	*/
33	public class SimilarNamedWays extends Test {
34
35	protected static final int SIMILAR_NAMED = 701;
36
37	/** All ways, grouped by cells */
38	private Map<Point2D,List<Way>> cellWays;
39	/** The already detected errors */
40	private MultiMap<Way, Way> errorWays;
41
42	private ArrayList<NormalizeRule> rules = new ArrayList<NormalizeRule>();
43
44	/**
45	* Constructor
46	*/
47	public SimilarNamedWays() {
48	super(tr("Similarly named ways"),
49	tr("This test checks for ways with similar names that may have been misspelled."));
50
51	// FIXME: hardcode these rules for now. Replace them with preferences later
52	// See https://josm.openstreetmap.de/ticket/3733#comment:19
53	addRegExprRule("\\d+", "0"); // Highway 66
54	addRegExprRule("\\d+(st\|nd\|rd\|th)", "0st"); // 3rd Ave
55	addRegExprRule("^[A-Z] ", "X"); // E Street
56	addSynonyms("east", "west", "north", "south");
57	addSynonyms("first", "second", "third");
58	}
59
60	@Override
61	public void startTest(ProgressMonitor monitor) {
62	super.startTest(monitor);
63	cellWays = new HashMap<>(1000);
64	errorWays = new MultiMap<>();
65	}
66
67	@Override
68	public void endTest() {
69	cellWays = null;
70	errorWays = null;
71	super.endTest();
72	}
73
74	@Override
75	public void visit(Way w) {
76	if (!w.isUsable())
77	return;
78
79	String name = w.get("name");
80	if (name == null \|\| name.length() < 6)
81	return;
82
83	List<List<Way>> theCellWays = ValUtil.getWaysInCell(w, cellWays);
84	for (List<Way> ways : theCellWays) {
85	for (Way w2 : ways) {
86	if (errorWays.contains(w, w2) \|\| errorWays.contains(w2, w)) {
87	continue;
88	}
89
90	String name2 = w2.get("name");
91	if (name2 == null \|\| name2.length() < 6) {
92	continue;
93	}
94
95	if (similaryName(name, name2)) {
96	List<OsmPrimitive> primitives = new ArrayList<>(2);
97	primitives.add(w);
98	primitives.add(w2);
99	errors.add(new TestError(this, Severity.WARNING, tr("Similarly named ways"), SIMILAR_NAMED, primitives));
100	errorWays.put(w, w2);
101	}
102	}
103	ways.add(w);
104	}
105	}
106
107	/**
108	* Compute Levenshtein distance
109	*
110	* @param s First word
111	* @param t Second word
112	* @return The distance between words
113	*/
114	public static int getLevenshteinDistance(String s, String t) {
115	int[][] d; // matrix
116	int n; // length of s
117	int m; // length of t
118	int i; // iterates through s
119	int j; // iterates through t
120	char s_i; // ith character of s
121	char t_j; // jth character of t
122	int cost; // cost
123
124	// Step 1
125	n = s.length();
126	m = t.length();
127	if (n == 0)
128	return m;
129	if (m == 0)
130	return n;
131	d = new int[n + 1][m + 1];
132
133	// Step 2
134	for (i = 0; i <= n; i++) {
135	d[i][0] = i;
136	}
137	for (j = 0; j <= m; j++) {
138	d[0][j] = j;
139	}
140
141	// Step 3
142	for (i = 1; i <= n; i++) {
143
144	s_i = s.charAt(i - 1);
145
146	// Step 4
147	for (j = 1; j <= m; j++) {
148
149	t_j = t.charAt(j - 1);
150
151	// Step 5
152	if (s_i == t_j) {
153	cost = 0;
154	} else {
155	cost = 1;
156	}
157
158	// Step 6
159	d[i][j] = Utils.min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
160	}
161	}
162
163	// Step 7
164	return d[n][m];
165	}
166
167	/**
168	* Add a regular expression rule.
169	* @param regExpr the regular expression to search for
170	* @param replacement a string to replace with, which should match the expression.
171	*/
172	public void addRegExprRule(String regExpr, String replacement) {
173	rules.add(new RegExprRule(regExpr, replacement));
174	}
175
176	/**
177	* Add a rule with synonym words.
178	* @param words words which are synonyms
179	*/
180	public void addSynonyms(String... words) {
181	for (String word : words) {
182	rules.add(new SynonymRule(word, words));
183	}
184	}
185
186	/**
187	* Check if two names are similar, but not identical. First both names will be "normalized".
188	* Afterwards the Levenshtein distance will be calculated.<br>
189	* Examples for normalization rules:<br>
190	* <code>replaceAll("\\d+", "0")</code><br>
191	* would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true
192	* @param name first name to compare
193	* @param name2 second name to compare
194	* @return true if the normalized names are different but only a "little bit"
195	*/
196	public boolean similaryName(String name, String name2) {
197	// check plain strings
198	int distance = getLevenshteinDistance(name, name2);
199	boolean similar = distance>0 && distance<=2;
200
201	// try all rules
202	for (NormalizeRule rule : rules) {
203	int levenshteinDistance = getLevenshteinDistance(rule.normalize(name), rule.normalize(name2));
204	if (levenshteinDistance == 0)
205	// one rule results in identical names: identical
206	return false;
207	else if (levenshteinDistance <= 2) {
208	// 0 < distance <= 2
209	similar = true;
210	}
211	}
212	return similar;
213	}
214
215	public interface NormalizeRule {
216
217	/**
218	* Normalize the string by replacing parts.
219	* @param name name to normalize
220	* @return normalized string
221	*/
222	String normalize(String name);
223	}
224
225	public class RegExprRule implements NormalizeRule {
226	private final Pattern regExpr;
227	private final String replacement;
228
229	public RegExprRule(String expression, String replacement) {
230	this.regExpr = Pattern.compile(expression);
231	this.replacement = replacement;
232	}
233
234	@Override
235	public String normalize(String name) {
236	return regExpr.matcher(name).replaceAll(replacement);
237	}
238
239	@Override
240	public String toString() {
241	return "replaceAll(" + regExpr + ", " + replacement + ")";
242	}
243	}
244
245	public class SynonymRule implements NormalizeRule {
246
247	private final String[] words;
248	private final Pattern regExpr;
249	private final String replacement;
250
251	public SynonymRule(String replacement, String[] words) {
252	this.replacement = replacement.toLowerCase();
253	this.words = words;
254
255	// build regular expression for other words (for fast match)
256	StringBuilder expression = new StringBuilder();
257	int maxLength = 0;
258	for (int i = 0; i < words.length; i++) {
259	if (words[i].length() > maxLength) {
260	maxLength = words[i].length();
261	}
262	if (expression.length() > 0) {
263	expression.append("\|");
264	}
265	expression.append(Pattern.quote(words[i]));
266	}
267	this.regExpr = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE);
268	}
269
270	@Override
271	public String normalize(String name) {
272	// find first match
273	Matcher matcher = regExpr.matcher(name);
274	if (!matcher.find())
275	return name;
276
277	int start = matcher.start();
278
279	// which word matches?
280	String part = "";
281	for (int i = 0; i < words.length; i++) {
282	String word = words[i];
283	part = name.substring(start, start + word.length());
284	if (word.equalsIgnoreCase(part)) {
285	break;
286	}
287	}
288
289	// replace the word
290	char[] newName = matcher.replaceFirst(replacement).toCharArray();
291
292	// adjust case (replacement is not shorter than matching word!)
293	int minLength = Math.min(replacement.length(), part.length());
294	for (int i = 0; i < minLength; i++) {
295	if (Character.isUpperCase(part.charAt(i))) {
296	newName[start + i] = Character.toUpperCase(newName[start + i]);
297	}
298	}
299
300	return new String(newName);
301	}
302
303	@Override
304	public String toString() {
305	return "synonyms(" + replacement + ", " + Arrays.toString(words) + ")";
306	}
307	}
308	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: