source: osm/applications/editors/josm/plugins/pbf/src/crosby/binary/StringTable.java@ 30490

Last change on this file since 30490 was 30490, checked in by donvip, 11 years ago

[josm_pbf] upgrade to protoc 2.5.0 and crosby.binary 1.3.3

File size: 5.4 KB
Line 
1/** Copyright (c) 2010 Scott A. Crosby. <scott@sacrosby.com>
2
3 This program is free software: you can redistribute it and/or modify
4 it under the terms of the GNU Lesser General Public License as
5 published by the Free Software Foundation, either version 3 of the
6 License, or (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program. If not, see <http://www.gnu.org/licenses/>.
15
16*/
17
18package crosby.binary;
19
20import java.util.Arrays;
21import java.util.Comparator;
22import java.util.HashMap;
23
24import com.google.protobuf.ByteString;
25
26/**
27 * Class for mapping a set of strings to integers, giving frequently occuring
28 * strings small integers.
29 */
30public class StringTable {
31 public StringTable() {
32 clear();
33 }
34
35 private HashMap<String, Integer> counts;
36 private HashMap<String, Integer> stringmap;
37 private String set[];
38
39 public void incr(String s) {
40 if (counts.containsKey(s)) {
41 counts.put(s, new Integer(counts.get(s).intValue() + 1));
42 } else {
43 counts.put(s, new Integer(1));
44 }
45 }
46
47 /** After the stringtable has been built, return the offset of a string in it.
48 *
49 * Note, value '0' is reserved for use as a delimiter and will not be returned.
50 * @param s
51 * @return
52 */
53 public int getIndex(String s) {
54 return stringmap.get(s).intValue();
55 }
56
57 public void finish() {
58 Comparator<String> comparator = new Comparator<String>() {
59 @Override
60 public int compare(final String s1, String s2) {
61 int diff = counts.get(s2) - counts.get(s1);
62 return diff;
63 }
64 };
65
66 /* Sort the stringtable */
67
68 /*
69 When a string is referenced, strings in the stringtable with indices:
70 0 : Is reserved (used as a delimiter in tags
71 A: 1 to 127 : Uses can be represented with 1 byte
72 B: 128 to 128**2-1 : Uses can be represented with 2 bytes,
73 C: 128*128 to X : Uses can be represented with 3 bytes in the unlikely case we have >16k strings in a block. No block will contain enough strings that we'll need 4 bytes.
74
75 There are goals that will improve compression:
76 1. I want to use 1 bytes for the most frequently occurring strings, then 2 bytes, then 3 bytes.
77 2. I want to use low integers as frequently as possible (for better
78 entropy encoding out of deflate)
79 3. I want the stringtable to compress as small as possible.
80
81 Condition 1 is obvious. Condition 2 makes deflate compress stringtable references more effectively.
82 When compressing entities, delta coding causes small positive integers to occur more frequently
83 than larger integers. Even though a stringtable references to indices of 1 and 127 both use one
84 byte in a decompressed file, the small integer bias causes deflate to use fewer bits to represent
85 the smaller index when compressed. Condition 3 is most effective when adjacent strings in the
86 stringtable have a lot of common substrings.
87
88 So, when I decide on the master stringtable to use, I put the 127 most frequently occurring
89 strings into A (accomplishing goal 1), and sort them by frequency (to accomplish goal 2), but
90 for B and C, which contain the less progressively less frequently encountered strings, I sort
91 them lexiconographically, to maximize goal 3 and ignoring goal 2.
92
93 Goal 1 is the most important. Goal 2 helped enough to be worth it, and goal 3 was pretty minor,
94 but all should be re-benchmarked.
95
96
97 */
98
99
100
101 set = counts.keySet().toArray(new String[0]);
102 if (set.length > 0) {
103 // Sort based on the frequency.
104 Arrays.sort(set, comparator);
105 // Each group of keys that serializes to the same number of bytes is
106 // sorted lexiconographically.
107 // to maximize deflate compression.
108
109 // Don't sort the first array. There's not likely to be much benefit, and we want frequent values to be small.
110 //Arrays.sort(set, Math.min(0, set.length-1), Math.min(1 << 7, set.length-1));
111
112 Arrays.sort(set, Math.min(1 << 7, set.length-1), Math.min(1 << 14,
113 set.length-1));
114 Arrays.sort(set, Math.min(1 << 14, set.length-1), Math.min(1 << 21,
115 set.length-1), comparator);
116 }
117 stringmap = new HashMap<String, Integer>(2 * set.length);
118 for (int i = 0; i < set.length; i++) {
119 stringmap.put(set[i], new Integer(i+1)); // Index 0 is reserved for use as a delimiter.
120 }
121 counts = null;
122 }
123
124 public void clear() {
125 counts = new HashMap<String, Integer>(100);
126 stringmap = null;
127 set = null;
128 }
129
130 public Osmformat.StringTable.Builder serialize() {
131 Osmformat.StringTable.Builder builder = Osmformat.StringTable
132 .newBuilder();
133 builder.addS(ByteString.copyFromUtf8("")); // Add a unused string at offset 0 which is used as a delimiter.
134 for (int i = 0; i < set.length; i++)
135 builder.addS(ByteString.copyFromUtf8(set[i]));
136 return builder;
137 }
138}
Note: See TracBrowser for help on using the repository browser.