1 | /** Copyright (c) 2010 Scott A. Crosby. <scott@sacrosby.com>
|
---|
2 |
|
---|
3 | This program is free software: you can redistribute it and/or modify
|
---|
4 | it under the terms of the GNU Lesser General Public License as
|
---|
5 | published by the Free Software Foundation, either version 3 of the
|
---|
6 | License, or (at your option) any later version.
|
---|
7 |
|
---|
8 | This program is distributed in the hope that it will be useful,
|
---|
9 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
11 | GNU General Public License for more details.
|
---|
12 |
|
---|
13 | You should have received a copy of the GNU General Public License
|
---|
14 | along with this program. If not, see <http://www.gnu.org/licenses/>.
|
---|
15 |
|
---|
16 | */
|
---|
17 |
|
---|
18 | package crosby.binary;
|
---|
19 |
|
---|
20 | import java.util.Arrays;
|
---|
21 | import java.util.Comparator;
|
---|
22 | import java.util.HashMap;
|
---|
23 |
|
---|
24 | import com.google.protobuf.ByteString;
|
---|
25 |
|
---|
26 | /**
|
---|
27 | * Class for mapping a set of strings to integers, giving frequently occuring
|
---|
28 | * strings small integers.
|
---|
29 | */
|
---|
30 | public class StringTable {
|
---|
31 | public StringTable() {
|
---|
32 | clear();
|
---|
33 | }
|
---|
34 |
|
---|
35 | private HashMap<String, Integer> counts;
|
---|
36 | private HashMap<String, Integer> stringmap;
|
---|
37 | private String set[];
|
---|
38 |
|
---|
39 | public void incr(String s) {
|
---|
40 | if (counts.containsKey(s)) {
|
---|
41 | counts.put(s, new Integer(counts.get(s).intValue() + 1));
|
---|
42 | } else {
|
---|
43 | counts.put(s, new Integer(1));
|
---|
44 | }
|
---|
45 | }
|
---|
46 |
|
---|
47 | /** After the stringtable has been built, return the offset of a string in it.
|
---|
48 | *
|
---|
49 | * Note, value '0' is reserved for use as a delimiter and will not be returned.
|
---|
50 | * @param s
|
---|
51 | * @return
|
---|
52 | */
|
---|
53 | public int getIndex(String s) {
|
---|
54 | return stringmap.get(s).intValue();
|
---|
55 | }
|
---|
56 |
|
---|
57 | public void finish() {
|
---|
58 | Comparator<String> comparator = new Comparator<String>() {
|
---|
59 | @Override
|
---|
60 | public int compare(final String s1, String s2) {
|
---|
61 | int diff = counts.get(s2) - counts.get(s1);
|
---|
62 | return diff;
|
---|
63 | }
|
---|
64 | };
|
---|
65 |
|
---|
66 | /* Sort the stringtable */
|
---|
67 |
|
---|
68 | /*
|
---|
69 | When a string is referenced, strings in the stringtable with indices:
|
---|
70 | 0 : Is reserved (used as a delimiter in tags
|
---|
71 | A: 1 to 127 : Uses can be represented with 1 byte
|
---|
72 | B: 128 to 128**2-1 : Uses can be represented with 2 bytes,
|
---|
73 | C: 128*128 to X : Uses can be represented with 3 bytes in the unlikely case we have >16k strings in a block. No block will contain enough strings that we'll need 4 bytes.
|
---|
74 |
|
---|
75 | There are goals that will improve compression:
|
---|
76 | 1. I want to use 1 bytes for the most frequently occurring strings, then 2 bytes, then 3 bytes.
|
---|
77 | 2. I want to use low integers as frequently as possible (for better
|
---|
78 | entropy encoding out of deflate)
|
---|
79 | 3. I want the stringtable to compress as small as possible.
|
---|
80 |
|
---|
81 | Condition 1 is obvious. Condition 2 makes deflate compress stringtable references more effectively.
|
---|
82 | When compressing entities, delta coding causes small positive integers to occur more frequently
|
---|
83 | than larger integers. Even though a stringtable references to indices of 1 and 127 both use one
|
---|
84 | byte in a decompressed file, the small integer bias causes deflate to use fewer bits to represent
|
---|
85 | the smaller index when compressed. Condition 3 is most effective when adjacent strings in the
|
---|
86 | stringtable have a lot of common substrings.
|
---|
87 |
|
---|
88 | So, when I decide on the master stringtable to use, I put the 127 most frequently occurring
|
---|
89 | strings into A (accomplishing goal 1), and sort them by frequency (to accomplish goal 2), but
|
---|
90 | for B and C, which contain the less progressively less frequently encountered strings, I sort
|
---|
91 | them lexiconographically, to maximize goal 3 and ignoring goal 2.
|
---|
92 |
|
---|
93 | Goal 1 is the most important. Goal 2 helped enough to be worth it, and goal 3 was pretty minor,
|
---|
94 | but all should be re-benchmarked.
|
---|
95 |
|
---|
96 |
|
---|
97 | */
|
---|
98 |
|
---|
99 |
|
---|
100 |
|
---|
101 | set = counts.keySet().toArray(new String[0]);
|
---|
102 | if (set.length > 0) {
|
---|
103 | // Sort based on the frequency.
|
---|
104 | Arrays.sort(set, comparator);
|
---|
105 | // Each group of keys that serializes to the same number of bytes is
|
---|
106 | // sorted lexiconographically.
|
---|
107 | // to maximize deflate compression.
|
---|
108 |
|
---|
109 | // Don't sort the first array. There's not likely to be much benefit, and we want frequent values to be small.
|
---|
110 | //Arrays.sort(set, Math.min(0, set.length-1), Math.min(1 << 7, set.length-1));
|
---|
111 |
|
---|
112 | Arrays.sort(set, Math.min(1 << 7, set.length-1), Math.min(1 << 14,
|
---|
113 | set.length-1));
|
---|
114 | Arrays.sort(set, Math.min(1 << 14, set.length-1), Math.min(1 << 21,
|
---|
115 | set.length-1), comparator);
|
---|
116 | }
|
---|
117 | stringmap = new HashMap<String, Integer>(2 * set.length);
|
---|
118 | for (int i = 0; i < set.length; i++) {
|
---|
119 | stringmap.put(set[i], new Integer(i+1)); // Index 0 is reserved for use as a delimiter.
|
---|
120 | }
|
---|
121 | counts = null;
|
---|
122 | }
|
---|
123 |
|
---|
124 | public void clear() {
|
---|
125 | counts = new HashMap<String, Integer>(100);
|
---|
126 | stringmap = null;
|
---|
127 | set = null;
|
---|
128 | }
|
---|
129 |
|
---|
130 | public Osmformat.StringTable.Builder serialize() {
|
---|
131 | Osmformat.StringTable.Builder builder = Osmformat.StringTable
|
---|
132 | .newBuilder();
|
---|
133 | builder.addS(ByteString.copyFromUtf8("")); // Add a unused string at offset 0 which is used as a delimiter.
|
---|
134 | for (int i = 0; i < set.length; i++)
|
---|
135 | builder.addS(ByteString.copyFromUtf8(set[i]));
|
---|
136 | return builder;
|
---|
137 | }
|
---|
138 | }
|
---|