Context Navigation

← Previous Change
Next Change →

openstreetmap

Timestamp:

2014-12-20T02:55:42+01:00 (12 years ago)

Author:

Don-vip

Message:

fix #3733 - SimilarNamedWays naïvely uses Levenshtein distance and marks a lot of false positives (patch by mdk, brianegge, modified). Rules still need to be stored in JOSM preferences instead of current hardcoding.

Location:

trunk/src/org/openstreetmap/josm/data/validation

Files:

: 2 edited

TestError.java (modified) (1 diff)
tests/SimilarNamedWays.java (modified) (6 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/src/org/openstreetmap/josm/data/validation/TestError.java

r7005	r7848
327	327	@Override
328	328	public String toString() {
329		return "TestError [tester=" + tester + ", code=" + code + "]";
	329	return "TestError [tester=" + tester + ", code=" + code + ", message=" + message + "]";
330	330	}
331	331	}

trunk/src/org/openstreetmap/josm/data/validation/tests/SimilarNamedWays.java

-              r7005
+              r7848
 package org.openstreetmap.josm.data.validation.tests;
+import static java.util.regex.Pattern.CASE_INSENSITIVE;
+import static java.util.regex.Pattern.UNICODE_CASE;
 import static org.openstreetmap.josm.tools.I18n.tr;
 import java.awt.geom.Point2D;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import org.openstreetmap.josm.data.osm.OsmPrimitive;
 …
     private MultiMap<Way, Way> errorWays;
+    private ArrayList<NormalizeRule> rules = new ArrayList<NormalizeRule>();
     /**
      * Constructor
 …
         super(tr("Similarly named ways"),
                 tr("This test checks for ways with similar names that may have been misspelled."));
+        // FIXME: hardcode these rules for now. Replace them with preferences later
+        // See https://josm.openstreetmap.de/ticket/3733#comment:19
+        addRegExprRule("\\d+", "0"); // Highway 66
+        addRegExprRule("\\d+(st|nd|rd|th)", "0st"); // 3rd Ave
+        addRegExprRule("^[A-Z] ", "X"); // E Street
+        addSynonyms("east", "west", "north", "south");
+        addSynonyms("first", "second", "third");
+    }
 …
+                }
-                int levenshteinDistance = getLevenshteinDistance(name, name2);
-                if (0 < levenshteinDistance && levenshteinDistance <= 2) {
+                if (similaryName(name, name2)) {
                     List<OsmPrimitive> primitives = new ArrayList<>(2);
                     primitives.add(w);
 …
      * @return The distance between words
      */
-    public int getLevenshteinDistance(String s, String t) {
+    public static int getLevenshteinDistance(String s, String t) {
         int[][] d; // matrix
         int n; // length of s
 …
         return d[n][m];
+    }
+    /**
+     * Add a regular expression rule.
+     * @param regExpr the regular expression to search for
+     * @param replacement a string to replace with, which should match the expression.
+     */
+    public void addRegExprRule(String regExpr, String replacement) {
+        rules.add(new RegExprRule(regExpr, replacement));
+    }
+    /**
+     * Add a rule with synonym words.
+     * @param words words which are synonyms
+     */
+    public void addSynonyms(String... words) {
+        for (String word : words) {
+            rules.add(new SynonymRule(word, words));
+        }
+    }
+    /**
+     * Check if two names are similar, but not identical. First both names will be "normalized".
+     * Afterwards the Levenshtein distance will be calculated.<br>
+     * Examples for normalization rules:<br>
+     * <code>replaceAll("\\d+", "0")</code><br>
+     * would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true
+     * @param name first name to compare
+     * @param name2 second name to compare
+     * @return true if the normalized names are different but only a "little bit"
+     */
+    public boolean similaryName(String name, String name2) {
+        // check plain strings
+        int distance = getLevenshteinDistance(name, name2);
+        boolean similar = distance>0 && distance<=2;
+        // try all rules
+        for (NormalizeRule rule : rules) {
+            int levenshteinDistance = getLevenshteinDistance(rule.normalize(name), rule.normalize(name2));
+            if (levenshteinDistance == 0)
+                // one rule results in identical names: identical
+                return false;
+            else if (levenshteinDistance <= 2) {
+                // 0 < distance <= 2
+                similar = true;
+            }
+        }
+        return similar;
+    }
+    public interface NormalizeRule {
+        /**
+         * Normalize the string by replacing parts.
+         * The similarity check applies each registered rule to both names being compared and
+         * then measures the Levenshtein distance between the two normalized results.
+         * @param name name to normalize
+         * @return normalized string
+         */
+        String normalize(String name);
+    }
+    public class RegExprRule implements NormalizeRule {
+        private final Pattern regExpr;
+        private final String replacement;
+        public RegExprRule(String expression, String replacement) {
+            this.regExpr = Pattern.compile(expression);
+            this.replacement = replacement;
+        }
+        @Override
+        public String normalize(String name) {
+            return regExpr.matcher(name).replaceAll(replacement);
+        }
+        @Override
+        public String toString() {
+            return "replaceAll(" + regExpr + ", " + replacement + ")";
+        }
+    }
+    public class SynonymRule implements NormalizeRule {
+        private final String[] words;
+        private final Pattern regExpr;
+        private final String replacement;
+        public SynonymRule(String replacement, String[] words) {
+            this.replacement = replacement.toLowerCase();
+            this.words = words;
+            // build regular expression for other words (for fast match)
+            StringBuilder expression = new StringBuilder();
+            int maxLength = 0;
+            for (int i = 0; i < words.length; i++) {
+                if (words[i].length() > maxLength) {
+                    maxLength = words[i].length();
+                }
+                if (expression.length() > 0) {
+                    expression.append("|");
+                }
+                expression.append(Pattern.quote(words[i]));
+            }
+            this.regExpr = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE);
+        }
+        @Override
+        public String normalize(String name) {
+            // find first match
+            Matcher matcher = regExpr.matcher(name);
+            if (!matcher.find())
+                return name;
+            int start = matcher.start();
+            // which word matches?
+            String part = "";
+            for (int i = 0; i < words.length; i++) {
+                String word = words[i];
+                part = name.substring(start, start + word.length());
+                if (word.equalsIgnoreCase(part)) {
+                    break;
+                }
+            }
+            // replace the word
+            char[] newName = matcher.replaceFirst(replacement).toCharArray();
+            // adjust case (replacement is not shorter than matching word!)
+            int minLength = Math.min(replacement.length(), part.length());
+            for (int i = 0; i < minLength; i++) {
+                if (Character.isUpperCase(part.charAt(i))) {
+                    newName[start + i] = Character.toUpperCase(newName[start + i]);
+                }
+            }
+            return new String(newName);
+        }
+        @Override
+        public String toString() {
+            return "synonyms(" + replacement + ", " + Arrays.toString(words) + ")";
+        }
+    }
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 7848 in josm for trunk/src/org/openstreetmap

Legend:

trunk/src/org/openstreetmap/josm/data/validation/TestError.java

trunk/src/org/openstreetmap/josm/data/validation/tests/SimilarNamedWays.java

Download in other formats: