Context Navigation

source: josm/trunk/src/org/openstreetmap/josm/data/validation/tests/SimilarNamedWays.java@ 12390

Last change on this file since 12390 was 12390, checked in by michael2402, 7 years ago
See #14794: Document data.validation package and subpackages.
Property svn:eol-style set to `native`
File size: 11.1 KB

Line
1	// License: GPL. For details, see LICENSE file.
2	package org.openstreetmap.josm.data.validation.tests;
3
4	import static java.util.regex.Pattern.CASE_INSENSITIVE;
5	import static java.util.regex.Pattern.UNICODE_CASE;
6	import static org.openstreetmap.josm.tools.I18n.tr;
7
8	import java.awt.geom.Point2D;
9	import java.text.Normalizer;
10	import java.util.ArrayList;
11	import java.util.Arrays;
12	import java.util.HashMap;
13	import java.util.List;
14	import java.util.Locale;
15	import java.util.Map;
16	import java.util.regex.Matcher;
17	import java.util.regex.Pattern;
18
19	import org.openstreetmap.josm.data.osm.OsmPrimitive;
20	import org.openstreetmap.josm.data.osm.Way;
21	import org.openstreetmap.josm.data.validation.Severity;
22	import org.openstreetmap.josm.data.validation.Test;
23	import org.openstreetmap.josm.data.validation.TestError;
24	import org.openstreetmap.josm.data.validation.util.ValUtil;
25	import org.openstreetmap.josm.gui.progress.ProgressMonitor;
26	import org.openstreetmap.josm.tools.MultiMap;
27
28	/**
29	* Checks for similar named ways, symptom of a possible typo. It uses the
30	* Levenshtein distance to check for similarity
31	*
32	* @author frsantos
33	*/
34	public class SimilarNamedWays extends Test {
35
36	protected static final int SIMILAR_NAMED = 701;
37
38	private static final Pattern REMOVE_DIACRITICS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");
39
40	/** All ways, grouped by cells */
41	private Map<Point2D, List<Way>> cellWays;
42	/** The already detected errors */
43	private MultiMap<Way, Way> errorWays;
44
45	private final List<NormalizeRule> rules = new ArrayList<>();
46
/**
 * Constructs the test and registers the built-in normalization rules.
 * <p>
 * The registered rules are:
 * <ul>
 * <li>every run of digits is replaced by {@code 0};</li>
 * <li>every run of digits followed by {@code st}, {@code nd}, {@code rd} or {@code th}
 * is replaced by {@code 0st};</li>
 * <li>a leading upper-case ASCII letter followed by a space is replaced by {@code X};</li>
 * <li>the words east/west/north/south are synonyms of each other;</li>
 * <li>the words first/second/third are synonyms of each other.</li>
 * </ul>
 */
public SimilarNamedWays() {
    super(tr("Similarly named ways"),
            tr("This test checks for ways with similar names that may have been misspelled."));

    // FIXME: hardcode these rules for now. Replace them with preferences later
    // See https://josm.openstreetmap.de/ticket/3733#comment:19
    addRegExprRule("\\d+", "0"); // Highway 66
    // Plain '|' alternation: a backslash before the pipe is not a valid Java string escape
    addRegExprRule("\\d+(st|nd|rd|th)", "0st"); // 3rd Ave
    addRegExprRule("^[A-Z] ", "X"); // E Street
    addSynonyms("east", "west", "north", "south");
    addSynonyms("first", "second", "third");
}
62
63	@Override
64	public void startTest(ProgressMonitor monitor) {
65	super.startTest(monitor);
66	cellWays = new HashMap<>(1000);
67	errorWays = new MultiMap<>();
68	}
69
70	@Override
71	public void endTest() {
72	cellWays = null;
73	errorWays = null;
74	super.endTest();
75	}
76
77	@Override
78	public void visit(Way w) {
79	if (!w.isUsable())
80	return;
81
82	String name = w.get("name");
83	if (name == null \|\| name.length() < 6)
84	return;
85
86	List<List<Way>> theCellWays = ValUtil.getWaysInCell(w, cellWays);
87	for (List<Way> ways : theCellWays) {
88	for (Way w2 : ways) {
89	if (errorWays.contains(w, w2) \|\| errorWays.contains(w2, w)) {
90	continue;
91	}
92
93	String name2 = w2.get("name");
94	if (name2 == null \|\| name2.length() < 6) {
95	continue;
96	}
97
98	if (similaryName(name, name2)) {
99	List<OsmPrimitive> primitives = new ArrayList<>(2);
100	primitives.add(w);
101	primitives.add(w2);
102	errors.add(TestError.builder(this, Severity.WARNING, SIMILAR_NAMED)
103	.message(tr("Similarly named ways"))
104	.primitives(primitives)
105	.build());
106	errorWays.put(w, w2);
107	}
108	}
109	ways.add(w);
110	}
111	}
112
113	/**
114	* Compute Levenshtein distance
115	*
116	* @param s First word
117	* @param t Second word
118	* @return The distance between words
119	*/
120	public static int getLevenshteinDistance(String s, String t) {
121	int[][] d; // matrix
122	int n; // length of s
123	int m; // length of t
124	int i; // iterates through s
125	int j; // iterates through t
126	char si; // ith character of s
127	char tj; // jth character of t
128	int cost; // cost
129
130	// Step 1
131	n = s.length();
132	m = t.length();
133	if (n == 0)
134	return m;
135	if (m == 0)
136	return n;
137	d = new int[n+1][m+1];
138
139	// Step 2
140	for (i = 0; i <= n; i++) {
141	d[i][0] = i;
142	}
143	for (j = 0; j <= m; j++) {
144	d[0][j] = j;
145	}
146
147	// Step 3
148	for (i = 1; i <= n; i++) {
149
150	si = s.charAt(i - 1);
151
152	// Step 4
153	for (j = 1; j <= m; j++) {
154
155	tj = t.charAt(j - 1);
156
157	// Step 5
158	if (si == tj) {
159	cost = 0;
160	} else {
161	cost = 1;
162	}
163
164	// Step 6
165	d[i][j] = Math.min(Math.min(d[i - 1][j] + 1, d[i][j - 1] + 1), d[i - 1][j - 1] + cost);
166	}
167	}
168
169	// Step 7
170	return d[n][m];
171	}
172
173	/**
174	* Add a regular expression rule.
175	* @param regExpr the regular expression to search for
176	* @param replacement a string to replace with, which should match the expression.
177	*/
178	public void addRegExprRule(String regExpr, String replacement) {
179	rules.add(new RegExprRule(regExpr, replacement));
180	}
181
182	/**
183	* Add a rule with synonym words.
184	* @param words words which are synonyms
185	*/
186	public void addSynonyms(String... words) {
187	for (String word : words) {
188	rules.add(new SynonymRule(word, words));
189	}
190	}
191
192	/**
193	* Check if two names are similar, but not identical. First both names will be "normalized".
194	* Afterwards the Levenshtein distance will be calculated.<br>
195	* Examples for normalization rules:<br>
196	* <code>replaceAll("\\d+", "0")</code><br>
197	* would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true
198	* @param name first name to compare
199	* @param name2 second name to compare
200	* @return true if the normalized names are different but only a "little bit"
201	*/
202	public boolean similaryName(String name, String name2) {
203	// check plain strings
204	int distance = getLevenshteinDistance(name, name2);
205	boolean similar = distance > 0 && distance <= 2;
206
207	// check if only the case differs, so we don't consider large distance as different strings
208	if (distance > 2 && name.length() == name2.length()) {
209	similar = deAccent(name).equalsIgnoreCase(deAccent(name2));
210	}
211
212	// try all rules
213	for (NormalizeRule rule : rules) {
214	int levenshteinDistance = getLevenshteinDistance(rule.normalize(name), rule.normalize(name2));
215	if (levenshteinDistance == 0)
216	// one rule results in identical names: identical
217	return false;
218	else if (levenshteinDistance <= 2) {
219	// 0 < distance <= 2
220	similar = true;
221	}
222	}
223	return similar;
224	}
225
226	/**
227	* Removes diacritics (accents) from string.
228	* @param str string
229	* @return {@code str} without any diacritic (accent)
230	* @since 12283
231	*/
232	public static String deAccent(String str) {
233	// https://stackoverflow.com/a/1215117/2257172
234	return REMOVE_DIACRITICS.matcher(Normalizer.normalize(str, Normalizer.Form.NFD)).replaceAll("");
235	}
236
/**
 * A normalization step that is applied to both names before they are compared.
 */
@FunctionalInterface
public interface NormalizeRule {

    /**
     * Returns the normalized form of a name, obtained by replacing parts of it.
     *
     * @param name name to normalize
     * @return normalized string
     */
    String normalize(String name);
}
250
251	/**
252	* A rule to replace by regular expression,
253	* so that all strings matching the regular expression are handled as if they were {@link RegExprRule#replacement}
254	*/
255	public static class RegExprRule implements NormalizeRule {
256	private final Pattern regExpr;
257	private final String replacement;
258
259	/**
260	* Create a new rule to replace by regular expression
261	* @param expression The regular expression
262	* @param replacement The replacement
263	*/
264	public RegExprRule(String expression, String replacement) {
265	this.regExpr = Pattern.compile(expression);
266	this.replacement = replacement;
267	}
268
269	@Override
270	public String normalize(String name) {
271	return regExpr.matcher(name).replaceAll(replacement);
272	}
273
274	@Override
275	public String toString() {
276	return "replaceAll(" + regExpr + ", " + replacement + ')';
277	}
278	}
279
280	/**
281	* A rule that registers synonyms to a given word
282	*/
283	public static class SynonymRule implements NormalizeRule {
284
285	private final String[] words;
286	private final Pattern regExpr;
287	private final String replacement;
288
289	/**
290	* Create a new {@link SynonymRule}
291	* @param replacement The word to use instead
292	* @param words The synonyms for that word
293	*/
294	public SynonymRule(String replacement, String... words) {
295	this.replacement = replacement.toLowerCase(Locale.ENGLISH);
296	this.words = words;
297
298	// build regular expression for other words (for fast match)
299	StringBuilder expression = new StringBuilder();
300	int maxLength = 0;
301	for (int i = 0; i < words.length; i++) {
302	if (words[i].length() > maxLength) {
303	maxLength = words[i].length();
304	}
305	if (expression.length() > 0) {
306	expression.append('\|');
307	}
308	expression.append(Pattern.quote(words[i]));
309	}
310	this.regExpr = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE);
311	}
312
313	@Override
314	public String normalize(String name) {
315	// find first match
316	Matcher matcher = regExpr.matcher(name);
317	if (!matcher.find())
318	return name;
319
320	int start = matcher.start();
321
322	// which word matches?
323	String part = "";
324	for (int i = 0; i < words.length; i++) {
325	String word = words[i];
326	part = name.substring(start, start + word.length());
327	if (word.equalsIgnoreCase(part)) {
328	break;
329	}
330	}
331
332	// replace the word
333	char[] newName = matcher.replaceFirst(replacement).toCharArray();
334
335	// adjust case (replacement is not shorter than matching word!)
336	int minLength = Math.min(replacement.length(), part.length());
337	for (int i = 0; i < minLength; i++) {
338	if (Character.isUpperCase(part.charAt(i))) {
339	newName[start + i] = Character.toUpperCase(newName[start + i]);
340	}
341	}
342
343	return new String(newName);
344	}
345
346	@Override
347	public String toString() {
348	return "synonyms(" + replacement + ", " + Arrays.toString(words) + ')';
349	}
350	}
351	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: