Context Navigation

source: osm/applications/editors/josm/plugins/pbf/src/crosby/binary/StringTable.java@ 30490

Last change on this file since 30490 was 30490, checked in by donvip, 11 years ago
[josm_pbf] upgrade to protoc 2.5.0 and crosby.binary 1.3.3
File size: 5.4 KB

Line
1	/** Copyright (c) 2010 Scott A. Crosby. <scott@sacrosby.com>
2
3	This program is free software: you can redistribute it and/or modify
4	it under the terms of the GNU Lesser General Public License as
5	published by the Free Software Foundation, either version 3 of the
6	License, or (at your option) any later version.
7
8	This program is distributed in the hope that it will be useful,
9	but WITHOUT ANY WARRANTY; without even the implied warranty of
10	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11	GNU General Public License for more details.
12
13	You should have received a copy of the GNU General Public License
14	along with this program. If not, see <http://www.gnu.org/licenses/>.
15
16	*/
17
18	package crosby.binary;
19
20	import java.util.Arrays;
21	import java.util.Comparator;
22	import java.util.HashMap;
23
24	import com.google.protobuf.ByteString;
25
26	/**
27	* Class for mapping a set of strings to integers, giving frequently occuring
28	* strings small integers.
29	*/
30	public class StringTable {
31	public StringTable() {
32	clear();
33	}
34
35	private HashMap<String, Integer> counts;
36	private HashMap<String, Integer> stringmap;
37	private String set[];
38
39	public void incr(String s) {
40	if (counts.containsKey(s)) {
41	counts.put(s, new Integer(counts.get(s).intValue() + 1));
42	} else {
43	counts.put(s, new Integer(1));
44	}
45	}
46
47	/** After the stringtable has been built, return the offset of a string in it.
48	*
49	* Note, value '0' is reserved for use as a delimiter and will not be returned.
50	* @param s
51	* @return
52	*/
53	public int getIndex(String s) {
54	return stringmap.get(s).intValue();
55	}
56
57	public void finish() {
58	Comparator<String> comparator = new Comparator<String>() {
59	@Override
60	public int compare(final String s1, String s2) {
61	int diff = counts.get(s2) - counts.get(s1);
62	return diff;
63	}
64	};
65
66	/* Sort the stringtable */
67
68	/*
69	When a string is referenced, strings in the stringtable with indices:
70	0 : Is reserved (used as a delimiter in tags
71	A: 1 to 127 : Uses can be represented with 1 byte
72	B: 128 to 128**2-1 : Uses can be represented with 2 bytes,
73	C: 128*128 to X : Uses can be represented with 3 bytes in the unlikely case we have >16k strings in a block. No block will contain enough strings that we'll need 4 bytes.
74
75	There are goals that will improve compression:
76	1. I want to use 1 bytes for the most frequently occurring strings, then 2 bytes, then 3 bytes.
77	2. I want to use low integers as frequently as possible (for better
78	entropy encoding out of deflate)
79	3. I want the stringtable to compress as small as possible.
80
81	Condition 1 is obvious. Condition 2 makes deflate compress stringtable references more effectively.
82	When compressing entities, delta coding causes small positive integers to occur more frequently
83	than larger integers. Even though a stringtable references to indices of 1 and 127 both use one
84	byte in a decompressed file, the small integer bias causes deflate to use fewer bits to represent
85	the smaller index when compressed. Condition 3 is most effective when adjacent strings in the
86	stringtable have a lot of common substrings.
87
88	So, when I decide on the master stringtable to use, I put the 127 most frequently occurring
89	strings into A (accomplishing goal 1), and sort them by frequency (to accomplish goal 2), but
90	for B and C, which contain the less progressively less frequently encountered strings, I sort
91	them lexiconographically, to maximize goal 3 and ignoring goal 2.
92
93	Goal 1 is the most important. Goal 2 helped enough to be worth it, and goal 3 was pretty minor,
94	but all should be re-benchmarked.
95
96
97	*/
98
99
100
101	set = counts.keySet().toArray(new String[0]);
102	if (set.length > 0) {
103	// Sort based on the frequency.
104	Arrays.sort(set, comparator);
105	// Each group of keys that serializes to the same number of bytes is
106	// sorted lexiconographically.
107	// to maximize deflate compression.
108
109	// Don't sort the first array. There's not likely to be much benefit, and we want frequent values to be small.
110	//Arrays.sort(set, Math.min(0, set.length-1), Math.min(1 << 7, set.length-1));
111
112	Arrays.sort(set, Math.min(1 << 7, set.length-1), Math.min(1 << 14,
113	set.length-1));
114	Arrays.sort(set, Math.min(1 << 14, set.length-1), Math.min(1 << 21,
115	set.length-1), comparator);
116	}
117	stringmap = new HashMap<String, Integer>(2 * set.length);
118	for (int i = 0; i < set.length; i++) {
119	stringmap.put(set[i], new Integer(i+1)); // Index 0 is reserved for use as a delimiter.
120	}
121	counts = null;
122	}
123
124	public void clear() {
125	counts = new HashMap<String, Integer>(100);
126	stringmap = null;
127	set = null;
128	}
129
130	public Osmformat.StringTable.Builder serialize() {
131	Osmformat.StringTable.Builder builder = Osmformat.StringTable
132	.newBuilder();
133	builder.addS(ByteString.copyFromUtf8("")); // Add a unused string at offset 0 which is used as a delimiter.
134	for (int i = 0; i < set.length; i++)
135	builder.addS(ByteString.copyFromUtf8(set[i]));
136	return builder;
137	}
138	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: