source: josm/trunk/src/org/openstreetmap/josm/tools/AlphanumComparator.java

Last change on this file was 18983, checked in by taylor.smock, 23 months ago

Fix #23471: fix an inconsistency between fast ASCII sort and slower unicode-aware sort

  • Property svn:eol-style set to native
File size: 9.5 KB
Line 
1// License: GPL. For details, see LICENSE file.
2package org.openstreetmap.josm.tools;
3
4/*
5 * The Alphanum Algorithm is an improved sorting algorithm for strings
6 * containing numbers. Instead of sorting numbers in ASCII order like a standard
7 * sort, this algorithm sorts numbers in numeric order.
8 *
9 * The Alphanum Algorithm is discussed at http://www.DaveKoelle.com
10 *
11 * Released under the MIT License - https://opensource.org/licenses/MIT
12 *
13 * Copyright 2007-2017 David Koelle
14 *
15 * Permission is hereby granted, free of charge, to any person obtaining
16 * a copy of this software and associated documentation files (the "Software"),
17 * to deal in the Software without restriction, including without limitation
18 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
19 * and/or sell copies of the Software, and to permit persons to whom the
20 * Software is furnished to do so, subject to the following conditions:
21 *
22 * The above copyright notice and this permission notice shall be included
23 * in all copies or substantial portions of the Software.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
28 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
29 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
30 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
31 * USE OR OTHER DEALINGS IN THE SOFTWARE.
32 */
33import java.io.Serializable;
34import java.text.Collator;
35import java.util.Arrays;
36import java.util.Comparator;
37
38/**
39 * The Alphanum Algorithm is an improved sorting algorithm for strings
40 * containing numbers: Instead of sorting numbers in ASCII order like a standard
41 * sort, this algorithm sorts numbers in numeric order.
42 * <p>
43 * The Alphanum Algorithm is discussed at
44 * <a href="https://web.archive.org/web/20210602024123/http://www.davekoelle.com/alphanum.html">DaveKoelle.com</a>
45 * <p>
46 * This is an updated version with enhancements made by Daniel Migowski, Andre
47 * Bogus, David Koelle and others.
48 *
49 */
50public final class AlphanumComparator implements Comparator<String>, Serializable {
51 /** {@code true} to use the faster ASCII sorting algorithm. Set to {@code false} when testing compatibility. */
52 static boolean useFastASCIISort = true;
53 /**
54 * The sort order for the fast ASCII sort method.
55 */
56 static final String ASCII_SORT_ORDER =
57 " \r\t\n\f\u000b-_,;:!?/.`^~'\"()[]{}@$*\\&#%+<=>|0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
58
59 private static final long serialVersionUID = 1L;
60
61 private static final AlphanumComparator INSTANCE = new AlphanumComparator();
62 /**
63 * A mapping from ASCII characters to the default {@link Collator} order.
64 * At writing, the default rules can be found in CollationRules#DEFAULTRULES.
65 */
66 private static final byte[] ASCII_MAPPING = new byte[128];
67 static {
68 for (int i = 0; i < ASCII_MAPPING.length; i++) {
69 ASCII_MAPPING[i] = (byte) i; // This is kind of pointless, but it is the default ASCII ordering.
70 }
71 // The control characters are "ignored"
72 Arrays.fill(ASCII_MAPPING, 0, 32, (byte) 0);
73 ASCII_MAPPING[127] = 0; // DEL is ignored.
74 // We have 37 order overrides for symbols; ASCII tables has control characters through 31. 32-47 are symbols.
75 // After the symbols, we have 0-9, and then aA-zZ.
76 // The character order
77 for (int i = 0; i < ASCII_SORT_ORDER.length(); i++) {
78 char c = ASCII_SORT_ORDER.charAt(i);
79 ASCII_MAPPING[c] = (byte) (i + 1);
80 }
81 }
82
83 /**
84 * Replies the unique instance.
85 * @return the unique instance
86 */
87 public static AlphanumComparator getInstance() {
88 return INSTANCE;
89 }
90
91 /**
92 * Constructs a new Alphanum Comparator.
93 */
94 private AlphanumComparator() {
95 }
96
97 /**
98 * Compare two ASCII strings in a manner compatible with the default {@link Collator}
99 * @param string1 The first string to compare
100 * @param len1 The length of the first string
101 * @param string2 The second string to compare
102 * @param len2 The length of the second string
103 * @return See {@link String#compareToIgnoreCase(String)} (e.g. {@code string1.compareToIgnoreCase(string2)}).
104 */
105 private static int compareString(String string1, int len1, String string2, int len2) {
106 int loc1 = 0;
107 int loc2 = 0;
108 while (loc1 < len1 && loc2 < len2) {
109 // Ignore control symbols
110 while (loc1 < len1 - 1 && string1.charAt(loc1) <= 32) {
111 loc1++;
112 }
113 while (loc2 < len2 - 1 && string2.charAt(loc2) <= 32) {
114 loc2++;
115 }
116 if (loc1 >= len1 || loc2 >= len2) break;
117
118 char lower1 = Character.toLowerCase(string1.charAt(loc1));
119 char lower2 = Character.toLowerCase(string2.charAt(loc2));
120
121 final int c1 = ASCII_MAPPING[lower1];
122 final int c2 = ASCII_MAPPING[lower2];
123 if (c1 != c2) {
124 return c1 - c2;
125 }
126 loc1++;
127 loc2++;
128 }
129 return len1 - len2;
130 }
131
132 /**
133 * Returns an alphanum chunk.
134 * Length of string is passed in for improved efficiency (only need to calculate it once).
135 * @param s string
136 * @param slength string length
137 * @param marker position
138 * @return alphanum chunk found at given position
139 */
140 private static String getChunk(String s, int slength, int marker) {
141 final int startMarker = marker;
142 char c = s.charAt(marker);
143 marker++;
144 if (Character.isDigit(c)) {
145 while (marker < slength) {
146 c = s.charAt(marker);
147 if (!Character.isDigit(c)) {
148 break;
149 }
150 marker++;
151 }
152 } else {
153 while (marker < slength) {
154 c = s.charAt(marker);
155 if (Character.isDigit(c)) {
156 break;
157 }
158 marker++;
159 }
160 }
161 return s.substring(startMarker, marker);
162 }
163
164 /**
165 * Check if a string is ASCII only
166 * @param string The string to check
167 * @param stringLength The length of the string (for performance reasons)
168 * @return {@code true} if the string only contains ascii characters
169 */
170 private static boolean isAscii(String string, int stringLength) {
171 for (int i = 0; i < stringLength; i++) {
172 char c = string.charAt(i);
173 if (c > ASCII_MAPPING.length) {
174 return false;
175 }
176 }
177 return true;
178 }
179
180 /**
181 * Compare two string chunks
182 * @param thisChunk The first chunk to compare
183 * @param thisChunkLength The length of the first chunk (for performance reasons)
184 * @param thatChunk The second chunk to compare
185 * @param thatChunkLength The length of the second chunk (for performance reasons)
186 * @return The {@link Comparator} result
187 */
188 private static int compareChunk(String thisChunk, int thisChunkLength, String thatChunk, int thatChunkLength) {
189 int result;
190 if (Character.isDigit(thisChunk.charAt(0)) && Character.isDigit(thatChunk.charAt(0))) {
191 // Simple chunk comparison by length.
192 result = thisChunkLength - thatChunkLength;
193 // If equal, the first different number counts
194 if (result == 0) {
195 for (int i = 0; i < thisChunkLength; i++) {
196 result = thisChunk.charAt(i) - thatChunk.charAt(i);
197 if (result != 0) {
198 return result;
199 }
200 }
201 }
202 } else {
203 // Check if both chunks are ascii only; if so, use a much faster sorting algorithm.
204 if (useFastASCIISort && isAscii(thisChunk, thisChunkLength) && isAscii(thatChunk, thatChunkLength)) {
205 return Utils.clamp(compareString(thisChunk, thisChunkLength, thatChunk, thatChunkLength), -1, 1);
206 }
207 // Instantiate the collator
208 Collator compareOperator = Collator.getInstance();
209 // Compare regardless of accented letters
210 compareOperator.setStrength(Collator.SECONDARY);
211 result = compareOperator.compare(thisChunk, thatChunk);
212 }
213 return result;
214 }
215
216 @Override
217 public int compare(String s1, String s2) {
218 if (s1 == null && s2 == null) {
219 return 0;
220 } else if (s1 == null) {
221 return -1;
222 } else if (s2 == null) {
223 return 1;
224 }
225
226 int thisMarker = 0;
227 int thatMarker = 0;
228 int s1Length = s1.length();
229 int s2Length = s2.length();
230
231 while (thisMarker < s1Length && thatMarker < s2Length) {
232 final String thisChunk = getChunk(s1, s1Length, thisMarker);
233 final int thisChunkLength = thisChunk.length();
234 thisMarker += thisChunkLength;
235
236 String thatChunk = getChunk(s2, s2Length, thatMarker);
237 final int thatChunkLength = thatChunk.length();
238 thatMarker += thatChunkLength;
239
240 // If both chunks contain numeric characters, sort them numerically
241 int result = compareChunk(thisChunk, thisChunkLength, thatChunk, thatChunkLength);
242
243 if (result != 0) {
244 return result;
245 }
246 }
247
248 return s1Length - s2Length;
249 }
250}
Note: See TracBrowser for help on using the repository browser.