Changeset 14991 in josm for trunk


Ignore:
Timestamp:
2019-04-14T22:17:22+02:00 (5 years ago)
Author:
Don-vip
Message:

fix #17595 - smarter detection of ZWNJ/ZWJ Unicode characters

Location:
trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/org/openstreetmap/josm/data/validation/tests/TagChecker.java

    r14952 r14991  
    7575    private static volatile MultiMap<String, String> oftenUsedTags = new MultiMap<>();
    7676
    77     private static final Pattern NON_PRINTING_CONTROL_CHARACTERS = Pattern.compile(
    78             "[\\x00-\\x09\\x0B\\x0C\\x0E-\\x1F\\x7F\\u200c-\\u200f\\u202a-\\u202e]");
     77    private static final Pattern UNWANTED_NON_PRINTING_CONTROL_CHARACTERS = Pattern.compile(
     78            "[\\x00-\\x09\\x0B\\x0C\\x0E-\\x1F\\x7F\\u200e-\\u200f\\u202a-\\u202e]");
    7979
    8080    /** The TagChecker data */
     
    378378
    379379    /**
    380      * Checks given string (key or value) if it contains non-printing control characters (either ASCII or Unicode bidi characters)
     380     * Checks given string (key or value) if it contains unwanted non-printing control characters (either ASCII or Unicode bidi characters)
    381381     * @param s string to check
    382382     * @return {@code true} if {@code s} contains non-printing control characters
    383383     */
    384     private static boolean containsNonPrintingControlCharacter(String s) {
    385         return s != null && s.chars().anyMatch(c -> (isAsciiControlChar(c) && !isNewLineChar(c)) || isBidiControlChar(c));
     384    static boolean containsUnwantedNonPrintingControlCharacter(String s) {
     385        return s != null && !s.isEmpty() && (
     386                isJoiningChar(s.charAt(0)) ||
     387                isJoiningChar(s.charAt(s.length() - 1)) ||
     388                s.chars().anyMatch(c -> (isAsciiControlChar(c) && !isNewLineChar(c)) || isBidiControlChar(c))
     389                );
    386390    }
    387391
     
    394398    }
    395399
     400    private static boolean isJoiningChar(int c) {
     401        return c == 0x200c || c == 0x200d; // ZWNJ, ZWJ
     402    }
     403
    396404    private static boolean isBidiControlChar(int c) {
    397         /* check for range 0x200c to 0x200f (ZWNJ, ZWJ, LRM, RLM) or
     405        /* check for range 0x200e to 0x200f (LRM, RLM) or
    398406                           0x202a to 0x202e (LRE, RLE, PDF, LRO, RLO) */
    399         return (((c & 0xfffffffc) == 0x200c) || ((c >= 0x202a) && (c <= 0x202e)));
    400     }
    401 
    402     static String removeNonPrintingControlCharacters(String s) {
    403         return NON_PRINTING_CONTROL_CHARACTERS.matcher(s).replaceAll("");
     407        return (c >= 0x200e && c <= 0x200f) || (c >= 0x202a && c <= 0x202e);
     408    }
     409
     410    static String removeUnwantedNonPrintingControlCharacters(String s) {
     411        // Remove all unwanted characters
     412        String result = UNWANTED_NON_PRINTING_CONTROL_CHARACTERS.matcher(s).replaceAll("");
     413        // Remove joining characters located at the beginning of the string
     414        while (!result.isEmpty() && isJoiningChar(result.charAt(0))) {
     415            result = result.substring(1);
     416        }
     417        // Remove joining characters located at the end of the string
     418        while (!result.isEmpty() && isJoiningChar(result.charAt(result.length() - 1))) {
     419            result = result.substring(0, result.length() - 1);
     420        }
     421        return result;
    404422    }
    405423
     
    583601        if (!checkValues || value == null)
    584602            return;
    585         if ((containsNonPrintingControlCharacter(value)) && !withErrors.contains(p, "ICV")) {
     603        if ((containsUnwantedNonPrintingControlCharacter(value)) && !withErrors.contains(p, "ICV")) {
    586604            errors.add(TestError.builder(this, Severity.WARNING, LOW_CHAR_VALUE)
    587605                    .message(tr("Tag value contains non-printing character"), s, key)
    588606                    .primitives(p)
    589                     .fix(() -> new ChangePropertyCommand(p, key, removeNonPrintingControlCharacters(value)))
     607                    .fix(() -> new ChangePropertyCommand(p, key, removeUnwantedNonPrintingControlCharacters(value)))
    590608                    .build());
    591609            withErrors.put(p, "ICV");
     
    639657        if (!checkKeys || key == null)
    640658            return;
    641         if ((containsNonPrintingControlCharacter(key)) && !withErrors.contains(p, "ICK")) {
     659        if ((containsUnwantedNonPrintingControlCharacter(key)) && !withErrors.contains(p, "ICK")) {
    642660            errors.add(TestError.builder(this, Severity.WARNING, LOW_CHAR_KEY)
    643661                    .message(tr("Tag key contains non-printing character"), s, key)
    644662                    .primitives(p)
    645                     .fix(() -> new ChangePropertyCommand(p, key, removeNonPrintingControlCharacters(key)))
     663                    .fix(() -> new ChangePropertyCommand(p, key, removeUnwantedNonPrintingControlCharacters(key)))
    646664                    .build());
    647665            withErrors.put(p, "ICK");
  • trunk/test/unit/org/openstreetmap/josm/data/validation/tests/TagCheckerTest.java

    r14933 r14991  
    99import java.util.ArrayList;
    1010import java.util.List;
    11 
     11import java.util.function.Consumer;
     12
     13import org.junit.Assert;
    1214import org.junit.Rule;
    1315import org.junit.Test;
     
    248250    }
    249251
    250     /**
    251      * Unit test of {@link TagChecker#removeNonPrintingControlCharacters}
    252      */
    253     @Test
    254     public void testRemoveUnprintableControlCharacters() {
     252    private static void doTestUnwantedNonprintingControlCharacters(String s, Consumer<Boolean> assertionC, String expected) {
     253        assertionC.accept(TagChecker.containsUnwantedNonPrintingControlCharacter(s));
     254        assertEquals(expected, TagChecker.removeUnwantedNonPrintingControlCharacters(s));
     255    }
     256
     257    private static void doTestUnwantedNonprintingControlCharacters(String s) {
     258        doTestUnwantedNonprintingControlCharacters(s, Assert::assertTrue, "");
     259    }
     260
     261    /**
     262     * Unit test of {@link TagChecker#containsUnwantedNonPrintingControlCharacter}
     263     *            / {@link TagChecker#removeUnwantedNonPrintingControlCharacters}
     264     */
     265    @Test
     266    public void testContainsRemoveUnwantedNonprintingControlCharacters() {
     267        // Check empty string is handled
     268        doTestUnwantedNonprintingControlCharacters("", Assert::assertFalse, "");
    255269        // Check 65 ASCII control characters are removed, except new lines
    256270        for (char c = 0x0; c < 0x20; c++) {
    257271            if (c != '\r' && c != '\n') {
    258                 assertTrue(TagChecker.removeNonPrintingControlCharacters(Character.toString(c)).isEmpty());
     272                doTestUnwantedNonprintingControlCharacters(Character.toString(c));
    259273            } else {
    260                 assertFalse(TagChecker.removeNonPrintingControlCharacters(Character.toString(c)).isEmpty());
     274                doTestUnwantedNonprintingControlCharacters(Character.toString(c), Assert::assertFalse, Character.toString(c));
    261275            }
    262276        }
    263         assertTrue(TagChecker.removeNonPrintingControlCharacters(Character.toString((char) 0x7F)).isEmpty());
    264         // Check 9 Unicode bidi control characters are removed
    265         for (char c = 0x200c; c <= 0x200f; c++) {
    266             assertTrue(TagChecker.removeNonPrintingControlCharacters(Character.toString(c)).isEmpty());
     277        doTestUnwantedNonprintingControlCharacters(Character.toString((char) 0x7F));
     278        // Check 7 Unicode bidi control characters are removed
     279        for (char c = 0x200e; c <= 0x200f; c++) {
     280            doTestUnwantedNonprintingControlCharacters(Character.toString(c));
    267281        }
    268282        for (char c = 0x202a; c <= 0x202e; c++) {
    269             assertTrue(TagChecker.removeNonPrintingControlCharacters(Character.toString(c)).isEmpty());
     283            doTestUnwantedNonprintingControlCharacters(Character.toString(c));
     284        }
     285        // Check joining characters are removed if located at the beginning or end of the string
     286        for (char c = 0x200c; c <= 0x200d; c++) {
     287            final String s = Character.toString(c);
     288            doTestUnwantedNonprintingControlCharacters(s);
     289            doTestUnwantedNonprintingControlCharacters(s + s);
     290            doTestUnwantedNonprintingControlCharacters(s + 'a' + s, Assert::assertTrue, "a");
     291            final String ok = 'a' + s + 'b';
     292            doTestUnwantedNonprintingControlCharacters(ok, Assert::assertFalse, ok);
     293            doTestUnwantedNonprintingControlCharacters(s + ok, Assert::assertTrue, ok);
     294            doTestUnwantedNonprintingControlCharacters(ok + s, Assert::assertTrue, ok);
     295            doTestUnwantedNonprintingControlCharacters(s + ok + s, Assert::assertTrue, ok);
    270296        }
    271297    }
Note: See TracChangeset for help on using the changeset viewer.