| 158 | |
| 159 | /** |
| 160 | * Add a regular expression rule. |
| 161 | * @param regExpr the regular expression to search for |
| 162 | * @param replacement a string to replace with, which should match the expression. |
| 163 | */ |
| 164 | public void addRegExprRule(String regExpr, String replacement) { |
| 165 | rules_.add(new RegExprRule(regExpr, replacement)); |
| 166 | } |
| 167 | |
| 168 | /** |
| 169 | * Add a rule with synonym words. |
| 170 | * @param words words which are synonyms |
| 171 | */ |
| 172 | public void addSynonyms(String... words) { |
| 173 | for (String word : words) { |
| 174 | rules_.add(new SynonymRule(word, words)); |
| 175 | } |
| 176 | } |
| 177 | |
| 178 | /** |
| 179 | * Check if two names are similar, but not identical. First both names will be "normalized". |
| 180 | * Afterwards the Levenshtein distance will be calculated.<br> |
| 181 | * Examples for normalization rules:<br> |
| 182 | * <code>replaceAll("\\d+", "0")</code><br> |
| 183 | * would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true |
| 184 | * @param name first name to compare |
| 185 | * @param name2 second name to compare |
| 186 | * @return true if the normalized names are different but only a "little bit" |
| 187 | */ |
| 188 | public boolean similaryName(String name, String name2) { |
| 189 | // check plain strings |
| 190 | int distance = getLevenshteinDistance(name, name2); |
| 191 | boolean similar = distance>0 && distance<=2; |
| 192 | |
| 193 | // try all rules |
| 194 | for (NormalizeRule rule : rules_) { |
| 195 | int levenshteinDistance = getLevenshteinDistance(rule.normalize(name), rule.normalize(name2)); |
| 196 | if (levenshteinDistance == 0) |
| 197 | // one rule results in identical names: identical |
| 198 | return false; |
| 199 | else if (levenshteinDistance <= 2) { |
| 200 | // 0 < distance <= 2 |
| 201 | similar = true; |
| 202 | } |
| 203 | } |
| 204 | return similar; |
| 205 | } |
| 206 | |
/**
 * A rule that brings a name into a normalized form before names are compared.
 */
public interface NormalizeRule {

    /**
     * Produces the normalized form of a name by replacing parts of it.
     * @param name the name to normalize
     * @return the normalized string
     */
    String normalize(String name);
}
| 217 | |
| 218 | public class RegExprRule implements NormalizeRule { |
| 219 | private Pattern regExpr_; |
| 220 | private String replacement_; |
| 221 | |
| 222 | public RegExprRule(String expression, String replacement) { |
| 223 | regExpr_ = Pattern.compile(expression); |
| 224 | replacement_ = replacement; |
| 225 | } |
| 226 | |
| 227 | @Override |
| 228 | public String normalize(String name) { |
| 229 | return regExpr_.matcher(name).replaceAll(replacement_); |
| 230 | } |
| 231 | |
| 232 | @Override |
| 233 | public String toString() { |
| 234 | return "replaceAll(" + regExpr_ + ", " + replacement_ + ")"; |
| 235 | } |
| 236 | } |
| 237 | |
| 238 | public class SynonymRule implements NormalizeRule { |
| 239 | |
| 240 | private String[] words_; |
| 241 | private Pattern regExpr_; |
| 242 | private String replacement_; |
| 243 | |
| 244 | public SynonymRule(String replacement, String[] words) { |
| 245 | replacement_ = replacement.toLowerCase(); |
| 246 | words_ = words; |
| 247 | |
| 248 | // build regular expression for other words (for fast match) |
| 249 | StringBuilder expression = new StringBuilder(); |
| 250 | int maxLength = 0; |
| 251 | for (int i = 0; i < words.length; i++) { |
| 252 | if (words[i].length() > maxLength) { |
| 253 | maxLength = words[i].length(); |
| 254 | } |
| 255 | if (expression.length() > 0) { |
| 256 | expression.append("|"); |
| 257 | } |
| 258 | expression.append(Pattern.quote(words[i])); |
| 259 | } |
| 260 | regExpr_ = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE); |
| 261 | } |
| 262 | |
| 263 | @Override |
| 264 | public String normalize(String name) { |
| 265 | // find first match |
| 266 | Matcher matcher = regExpr_.matcher(name); |
| 267 | if (!matcher.find()) |
| 268 | return name; |
| 269 | |
| 270 | int start = matcher.start(); |
| 271 | |
| 272 | // which word matches? |
| 273 | String part = ""; |
| 274 | for (int i = 0; i < words_.length; i++) { |
| 275 | String word = words_[i]; |
| 276 | part = name.substring(start, start + word.length()); |
| 277 | if (word.equalsIgnoreCase(part)) { |
| 278 | break; |
| 279 | } |
| 280 | } |
| 281 | |
| 282 | // replace the word |
| 283 | char[] newName = matcher.replaceFirst(replacement_).toCharArray(); |
| 284 | |
| 285 | // adjust case (replacement is not shorter than matching word!) |
| 286 | int minLength = Math.min(replacement_.length(), part.length()); |
| 287 | for (int i = 0; i < minLength; i++) { |
| 288 | if (Character.isUpperCase(part.charAt(i))) { |
| 289 | newName[start + i] = Character.toUpperCase(newName[start + i]); |
| 290 | } |
| 291 | } |
| 292 | |
| 293 | return new String(newName); |
| 294 | } |
| 295 | |
| 296 | @Override |
| 297 | public String toString() { |
| 298 | return "synonyms(" + replacement_ + ", " + Arrays.toString(words_) + ")"; |
| 299 | } |
| 300 | |
| 301 | } |
| 302 | |