Changeset 14952 in josm


Ignore:
Timestamp:
2019-04-02T00:44:58+02:00 (5 years ago)
Author:
Don-vip
Message:

fix #17546 - detects highly suspicious Unicode characters that have been seen in the OSM database

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/org/openstreetmap/josm/data/validation/tests/TagChecker.java

    r14949 r14952  
    408408    }
    409409
     410    /**
     411     * Detects highly suspicious Unicode characters that have been seen in the OSM database.
     412     * @param key tag key
     413     * @param b Unicode block of the current character
     414     * @return {@code true} if the current Unicode block is very unusual for the given key
     415     */
    410416    private static boolean isUnusualUnicodeBlock(String key, UnicodeBlock b) {
    411         return b == UnicodeBlock.IPA_EXTENSIONS && !key.endsWith(":pronunciation");
     417        return isUnusualPhoneticUse(key, b) || isUnusualBmpUse(b) || isUnusualSmpUse(b);
     418    }
     419
     420    private static boolean isUnusualPhoneticUse(String key, UnicodeBlock b) {
     421        return (b == UnicodeBlock.IPA_EXTENSIONS                        // U+0250..U+02AF
     422             || b == UnicodeBlock.PHONETIC_EXTENSIONS                   // U+1D00..U+1D7F
     423             || b == UnicodeBlock.PHONETIC_EXTENSIONS_SUPPLEMENT)       // U+1D80..U+1DBF
     424                && !key.endsWith(":pronunciation");
     425    }
     426
     427    private static boolean isUnusualBmpUse(UnicodeBlock b) {
     428        // CHECKSTYLE.OFF: BooleanExpressionComplexity
     429        return b == UnicodeBlock.COMBINING_MARKS_FOR_SYMBOLS            // U+20D0..U+20FF
     430            || b == UnicodeBlock.ARROWS                                 // U+2190..U+21FF
     431            || b == UnicodeBlock.MATHEMATICAL_OPERATORS                 // U+2200..U+22FF
     432            || b == UnicodeBlock.ENCLOSED_ALPHANUMERICS                 // U+2460..U+24FF
     433            || b == UnicodeBlock.BOX_DRAWING                            // U+2500..U+257F
     434            || b == UnicodeBlock.GEOMETRIC_SHAPES                       // U+25A0..U+25FF
     435            || b == UnicodeBlock.DINGBATS                               // U+2700..U+27BF
     436            || b == UnicodeBlock.MISCELLANEOUS_SYMBOLS_AND_ARROWS       // U+2B00..U+2BFF
     437            || b == UnicodeBlock.GLAGOLITIC                             // U+2C00..U+2C5F
     438            || b == UnicodeBlock.HANGUL_COMPATIBILITY_JAMO              // U+3130..U+318F
     439            || b == UnicodeBlock.ENCLOSED_CJK_LETTERS_AND_MONTHS        // U+3200..U+32FF
     440            || b == UnicodeBlock.LATIN_EXTENDED_D                       // U+A720..U+A7FF
     441            || b == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS           // U+F900..U+FAFF
     442            || b == UnicodeBlock.ALPHABETIC_PRESENTATION_FORMS          // U+FB00..U+FB4F
     443            || b == UnicodeBlock.VARIATION_SELECTORS                    // U+FE00..U+FE0F
     444            || b == UnicodeBlock.SPECIALS;                              // U+FFF0..U+FFFF
     445            // CHECKSTYLE.ON: BooleanExpressionComplexity
     446    }
     447
     448    private static boolean isUnusualSmpUse(UnicodeBlock b) {
     449        // UnicodeBlock.SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS is only defined in Java 9+
     450        return b == UnicodeBlock.MUSICAL_SYMBOLS                        // U+1D100..U+1D1FF
     451            || b == UnicodeBlock.ENCLOSED_ALPHANUMERIC_SUPPLEMENT       // U+1F100..U+1F1FF
     452            || b == UnicodeBlock.EMOTICONS                              // U+1F600..U+1F64F
     453            || b == UnicodeBlock.TRANSPORT_AND_MAP_SYMBOLS;             // U+1F680..U+1F6FF
    412454    }
    413455
Note: See TracChangeset for help on using the changeset viewer.