| 1 | /*
|
|---|
| 2 | * LZEncoder
|
|---|
| 3 | *
|
|---|
| 4 | * Authors: Lasse Collin <lasse.collin@tukaani.org>
|
|---|
| 5 | * Igor Pavlov <http://7-zip.org/>
|
|---|
| 6 | *
|
|---|
| 7 | * This file has been put into the public domain.
|
|---|
| 8 | * You can do whatever you want with this file.
|
|---|
| 9 | */
|
|---|
| 10 |
|
|---|
| 11 | package org.tukaani.xz.lz;
|
|---|
| 12 |
|
|---|
| 13 | import java.io.OutputStream;
|
|---|
| 14 | import java.io.IOException;
|
|---|
| 15 | import org.tukaani.xz.ArrayCache;
|
|---|
| 16 |
|
|---|
| 17 | public abstract class LZEncoder {
|
|---|
| 18 | public static final int MF_HC4 = 0x04;
|
|---|
| 19 | public static final int MF_BT4 = 0x14;
|
|---|
| 20 |
|
|---|
| 21 | /**
|
|---|
| 22 | * Number of bytes to keep available before the current byte
|
|---|
| 23 | * when moving the LZ window.
|
|---|
| 24 | */
|
|---|
| 25 | private final int keepSizeBefore;
|
|---|
| 26 |
|
|---|
| 27 | /**
|
|---|
| 28 | * Number of bytes that must be available, the current byte included,
|
|---|
| 29 | * to make hasEnoughData return true. Flushing and finishing are
|
|---|
| 30 | * naturally exceptions to this since there cannot be any data after
|
|---|
| 31 | * the end of the uncompressed input.
|
|---|
| 32 | */
|
|---|
| 33 | private final int keepSizeAfter;
|
|---|
| 34 |
|
|---|
| 35 | final int matchLenMax;
|
|---|
| 36 | final int niceLen;
|
|---|
| 37 |
|
|---|
| 38 | final byte[] buf;
|
|---|
| 39 | final int bufSize; // To avoid buf.length with an array-cached buf.
|
|---|
| 40 |
|
|---|
| 41 | int readPos = -1;
|
|---|
| 42 | private int readLimit = -1;
|
|---|
| 43 | private boolean finishing = false;
|
|---|
| 44 | private int writePos = 0;
|
|---|
| 45 | private int pendingSize = 0;
|
|---|
| 46 |
|
|---|
| 47 | static void normalize(int[] positions, int positionsCount,
|
|---|
| 48 | int normalizationOffset) {
|
|---|
| 49 | for (int i = 0; i < positionsCount; ++i) {
|
|---|
| 50 | if (positions[i] <= normalizationOffset)
|
|---|
| 51 | positions[i] = 0;
|
|---|
| 52 | else
|
|---|
| 53 | positions[i] -= normalizationOffset;
|
|---|
| 54 | }
|
|---|
| 55 | }
|
|---|
| 56 |
|
|---|
| 57 | /**
|
|---|
| 58 | * Gets the size of the LZ window buffer that needs to be allocated.
|
|---|
| 59 | */
|
|---|
| 60 | private static int getBufSize(
|
|---|
| 61 | int dictSize, int extraSizeBefore, int extraSizeAfter,
|
|---|
| 62 | int matchLenMax) {
|
|---|
| 63 | int keepSizeBefore = extraSizeBefore + dictSize;
|
|---|
| 64 | int keepSizeAfter = extraSizeAfter + matchLenMax;
|
|---|
| 65 | int reserveSize = Math.min(dictSize / 2 + (256 << 10), 512 << 20);
|
|---|
| 66 | return keepSizeBefore + keepSizeAfter + reserveSize;
|
|---|
| 67 | }
|
|---|
| 68 |
|
|---|
| 69 | /**
|
|---|
| 70 | * Gets approximate memory usage of the LZEncoder base structure and
|
|---|
| 71 | * the match finder as kibibytes.
|
|---|
| 72 | */
|
|---|
| 73 | public static int getMemoryUsage(
|
|---|
| 74 | int dictSize, int extraSizeBefore, int extraSizeAfter,
|
|---|
| 75 | int matchLenMax, int mf) {
|
|---|
| 76 | // Buffer size + a little extra
|
|---|
| 77 | int m = getBufSize(dictSize, extraSizeBefore, extraSizeAfter,
|
|---|
| 78 | matchLenMax) / 1024 + 10;
|
|---|
| 79 |
|
|---|
| 80 | switch (mf) {
|
|---|
| 81 | case MF_HC4:
|
|---|
| 82 | m += HC4.getMemoryUsage(dictSize);
|
|---|
| 83 | break;
|
|---|
| 84 |
|
|---|
| 85 | case MF_BT4:
|
|---|
| 86 | m += BT4.getMemoryUsage(dictSize);
|
|---|
| 87 | break;
|
|---|
| 88 |
|
|---|
| 89 | default:
|
|---|
| 90 | throw new IllegalArgumentException();
|
|---|
| 91 | }
|
|---|
| 92 |
|
|---|
| 93 | return m;
|
|---|
| 94 | }
|
|---|
| 95 |
|
|---|
| 96 | /**
|
|---|
| 97 | * Creates a new LZEncoder.
|
|---|
| 98 | * <p>
|
|---|
| 99 | * @param dictSize dictionary size
|
|---|
| 100 | *
|
|---|
| 101 | * @param extraSizeBefore
|
|---|
| 102 | * number of bytes to keep available in the
|
|---|
| 103 | * history in addition to dictSize
|
|---|
| 104 | *
|
|---|
| 105 | * @param extraSizeAfter
|
|---|
| 106 | * number of bytes that must be available
|
|---|
| 107 | * after current position + matchLenMax
|
|---|
| 108 | *
|
|---|
| 109 | * @param niceLen if a match of at least <code>niceLen</code>
|
|---|
| 110 | * bytes is found, be happy with it and don't
|
|---|
| 111 | * stop looking for longer matches
|
|---|
| 112 | *
|
|---|
| 113 | * @param matchLenMax don't test for matches longer than
|
|---|
| 114 | * <code>matchLenMax</code> bytes
|
|---|
| 115 | *
|
|---|
| 116 | * @param mf match finder ID
|
|---|
| 117 | *
|
|---|
| 118 | * @param depthLimit match finder search depth limit
|
|---|
| 119 | */
|
|---|
| 120 | public static LZEncoder getInstance(
|
|---|
| 121 | int dictSize, int extraSizeBefore, int extraSizeAfter,
|
|---|
| 122 | int niceLen, int matchLenMax, int mf, int depthLimit,
|
|---|
| 123 | ArrayCache arrayCache) {
|
|---|
| 124 | switch (mf) {
|
|---|
| 125 | case MF_HC4:
|
|---|
| 126 | return new HC4(dictSize, extraSizeBefore, extraSizeAfter,
|
|---|
| 127 | niceLen, matchLenMax, depthLimit, arrayCache);
|
|---|
| 128 |
|
|---|
| 129 | case MF_BT4:
|
|---|
| 130 | return new BT4(dictSize, extraSizeBefore, extraSizeAfter,
|
|---|
| 131 | niceLen, matchLenMax, depthLimit, arrayCache);
|
|---|
| 132 | }
|
|---|
| 133 |
|
|---|
| 134 | throw new IllegalArgumentException();
|
|---|
| 135 | }
|
|---|
| 136 |
|
|---|
| 137 | /**
|
|---|
| 138 | * Creates a new LZEncoder. See <code>getInstance</code>.
|
|---|
| 139 | */
|
|---|
| 140 | LZEncoder(int dictSize, int extraSizeBefore, int extraSizeAfter,
|
|---|
| 141 | int niceLen, int matchLenMax, ArrayCache arrayCache) {
|
|---|
| 142 | bufSize = getBufSize(dictSize, extraSizeBefore, extraSizeAfter,
|
|---|
| 143 | matchLenMax);
|
|---|
| 144 | buf = arrayCache.getByteArray(bufSize, false);
|
|---|
| 145 |
|
|---|
| 146 | keepSizeBefore = extraSizeBefore + dictSize;
|
|---|
| 147 | keepSizeAfter = extraSizeAfter + matchLenMax;
|
|---|
| 148 |
|
|---|
| 149 | this.matchLenMax = matchLenMax;
|
|---|
| 150 | this.niceLen = niceLen;
|
|---|
| 151 | }
|
|---|
| 152 |
|
|---|
| 153 | public void putArraysToCache(ArrayCache arrayCache) {
|
|---|
| 154 | arrayCache.putArray(buf);
|
|---|
| 155 | }
|
|---|
| 156 |
|
|---|
| 157 | /**
|
|---|
| 158 | * Sets a preset dictionary. If a preset dictionary is wanted, this
|
|---|
| 159 | * function must be called immediately after creating the LZEncoder
|
|---|
| 160 | * before any data has been encoded.
|
|---|
| 161 | */
|
|---|
| 162 | public void setPresetDict(int dictSize, byte[] presetDict) {
|
|---|
| 163 | assert !isStarted();
|
|---|
| 164 | assert writePos == 0;
|
|---|
| 165 |
|
|---|
| 166 | if (presetDict != null) {
|
|---|
| 167 | // If the preset dictionary buffer is bigger than the dictionary
|
|---|
| 168 | // size, copy only the tail of the preset dictionary.
|
|---|
| 169 | int copySize = Math.min(presetDict.length, dictSize);
|
|---|
| 170 | int offset = presetDict.length - copySize;
|
|---|
| 171 | System.arraycopy(presetDict, offset, buf, 0, copySize);
|
|---|
| 172 | writePos += copySize;
|
|---|
| 173 | skip(copySize);
|
|---|
| 174 | }
|
|---|
| 175 | }
|
|---|
| 176 |
|
|---|
| 177 | /**
|
|---|
| 178 | * Moves data from the end of the buffer to the beginning, discarding
|
|---|
| 179 | * old data and making space for new input.
|
|---|
| 180 | */
|
|---|
| 181 | private void moveWindow() {
|
|---|
| 182 | // Align the move to a multiple of 16 bytes. LZMA2 needs this
|
|---|
| 183 | // because it uses the lowest bits from readPos to get the
|
|---|
| 184 | // alignment of the uncompressed data.
|
|---|
| 185 | int moveOffset = (readPos + 1 - keepSizeBefore) & ~15;
|
|---|
| 186 | int moveSize = writePos - moveOffset;
|
|---|
| 187 | System.arraycopy(buf, moveOffset, buf, 0, moveSize);
|
|---|
| 188 |
|
|---|
| 189 | readPos -= moveOffset;
|
|---|
| 190 | readLimit -= moveOffset;
|
|---|
| 191 | writePos -= moveOffset;
|
|---|
| 192 | }
|
|---|
| 193 |
|
|---|
| 194 | /**
|
|---|
| 195 | * Copies new data into the LZEncoder's buffer.
|
|---|
| 196 | */
|
|---|
| 197 | public int fillWindow(byte[] in, int off, int len) {
|
|---|
| 198 | assert !finishing;
|
|---|
| 199 |
|
|---|
| 200 | // Move the sliding window if needed.
|
|---|
| 201 | if (readPos >= bufSize - keepSizeAfter)
|
|---|
| 202 | moveWindow();
|
|---|
| 203 |
|
|---|
| 204 | // Try to fill the dictionary buffer. If it becomes full,
|
|---|
| 205 | // some of the input bytes may be left unused.
|
|---|
| 206 | if (len > bufSize - writePos)
|
|---|
| 207 | len = bufSize - writePos;
|
|---|
| 208 |
|
|---|
| 209 | System.arraycopy(in, off, buf, writePos, len);
|
|---|
| 210 | writePos += len;
|
|---|
| 211 |
|
|---|
| 212 | // Set the new readLimit but only if there's enough data to allow
|
|---|
| 213 | // encoding of at least one more byte.
|
|---|
| 214 | if (writePos >= keepSizeAfter)
|
|---|
| 215 | readLimit = writePos - keepSizeAfter;
|
|---|
| 216 |
|
|---|
| 217 | processPendingBytes();
|
|---|
| 218 |
|
|---|
| 219 | // Tell the caller how much input we actually copied into
|
|---|
| 220 | // the dictionary.
|
|---|
| 221 | return len;
|
|---|
| 222 | }
|
|---|
| 223 |
|
|---|
| 224 | /**
|
|---|
| 225 | * Process pending bytes remaining from preset dictionary initialization
|
|---|
| 226 | * or encoder flush operation.
|
|---|
| 227 | */
|
|---|
| 228 | private void processPendingBytes() {
|
|---|
| 229 | // After flushing or setting a preset dictionary there will be
|
|---|
| 230 | // pending data that hasn't been ran through the match finder yet.
|
|---|
| 231 | // Run it through the match finder now if there is enough new data
|
|---|
| 232 | // available (readPos < readLimit) that the encoder may encode at
|
|---|
| 233 | // least one more input byte. This way we don't waste any time
|
|---|
| 234 | // looping in the match finder (and marking the same bytes as
|
|---|
| 235 | // pending again) if the application provides very little new data
|
|---|
| 236 | // per write call.
|
|---|
| 237 | if (pendingSize > 0 && readPos < readLimit) {
|
|---|
| 238 | readPos -= pendingSize;
|
|---|
| 239 | int oldPendingSize = pendingSize;
|
|---|
| 240 | pendingSize = 0;
|
|---|
| 241 | skip(oldPendingSize);
|
|---|
| 242 | assert pendingSize < oldPendingSize;
|
|---|
| 243 | }
|
|---|
| 244 | }
|
|---|
| 245 |
|
|---|
| 246 | /**
|
|---|
| 247 | * Returns true if at least one byte has already been run through
|
|---|
| 248 | * the match finder.
|
|---|
| 249 | */
|
|---|
| 250 | public boolean isStarted() {
|
|---|
| 251 | return readPos != -1;
|
|---|
| 252 | }
|
|---|
| 253 |
|
|---|
| 254 | /**
|
|---|
| 255 | * Marks that all the input needs to be made available in
|
|---|
| 256 | * the encoded output.
|
|---|
| 257 | */
|
|---|
| 258 | public void setFlushing() {
|
|---|
| 259 | readLimit = writePos - 1;
|
|---|
| 260 | processPendingBytes();
|
|---|
| 261 | }
|
|---|
| 262 |
|
|---|
| 263 | /**
|
|---|
| 264 | * Marks that there is no more input remaining. The read position
|
|---|
| 265 | * can be advanced until the end of the data.
|
|---|
| 266 | */
|
|---|
| 267 | public void setFinishing() {
|
|---|
| 268 | readLimit = writePos - 1;
|
|---|
| 269 | finishing = true;
|
|---|
| 270 | processPendingBytes();
|
|---|
| 271 | }
|
|---|
| 272 |
|
|---|
| 273 | /**
|
|---|
| 274 | * Tests if there is enough input available to let the caller encode
|
|---|
| 275 | * at least one more byte.
|
|---|
| 276 | */
|
|---|
| 277 | public boolean hasEnoughData(int alreadyReadLen) {
|
|---|
| 278 | return readPos - alreadyReadLen < readLimit;
|
|---|
| 279 | }
|
|---|
| 280 |
|
|---|
| 281 | public void copyUncompressed(OutputStream out, int backward, int len)
|
|---|
| 282 | throws IOException {
|
|---|
| 283 | out.write(buf, readPos + 1 - backward, len);
|
|---|
| 284 | }
|
|---|
| 285 |
|
|---|
| 286 | /**
|
|---|
| 287 | * Get the number of bytes available, including the current byte.
|
|---|
| 288 | * <p>
|
|---|
| 289 | * Note that the result is undefined if <code>getMatches</code> or
|
|---|
| 290 | * <code>skip</code> hasn't been called yet and no preset dictionary
|
|---|
| 291 | * is being used.
|
|---|
| 292 | */
|
|---|
| 293 | public int getAvail() {
|
|---|
| 294 | assert isStarted();
|
|---|
| 295 | return writePos - readPos;
|
|---|
| 296 | }
|
|---|
| 297 |
|
|---|
| 298 | /**
|
|---|
| 299 | * Gets the lowest four bits of the absolute offset of the current byte.
|
|---|
| 300 | * Bits other than the lowest four are undefined.
|
|---|
| 301 | */
|
|---|
| 302 | public int getPos() {
|
|---|
| 303 | return readPos;
|
|---|
| 304 | }
|
|---|
| 305 |
|
|---|
| 306 | /**
|
|---|
| 307 | * Gets the byte from the given backward offset.
|
|---|
| 308 | * <p>
|
|---|
| 309 | * The current byte is at <code>0</code>, the previous byte
|
|---|
| 310 | * at <code>1</code> etc. To get a byte at zero-based distance,
|
|---|
| 311 | * use <code>getByte(dist + 1)<code>.
|
|---|
| 312 | * <p>
|
|---|
| 313 | * This function is equivalent to <code>getByte(0, backward)</code>.
|
|---|
| 314 | */
|
|---|
| 315 | public int getByte(int backward) {
|
|---|
| 316 | return buf[readPos - backward] & 0xFF;
|
|---|
| 317 | }
|
|---|
| 318 |
|
|---|
| 319 | /**
|
|---|
| 320 | * Gets the byte from the given forward minus backward offset.
|
|---|
| 321 | * The forward offset is added to the current position. This lets
|
|---|
| 322 | * one read bytes ahead of the current byte.
|
|---|
| 323 | */
|
|---|
| 324 | public int getByte(int forward, int backward) {
|
|---|
| 325 | return buf[readPos + forward - backward] & 0xFF;
|
|---|
| 326 | }
|
|---|
| 327 |
|
|---|
| 328 | /**
|
|---|
| 329 | * Get the length of a match at the given distance.
|
|---|
| 330 | *
|
|---|
| 331 | * @param dist zero-based distance of the match to test
|
|---|
| 332 | * @param lenLimit don't test for a match longer than this
|
|---|
| 333 | *
|
|---|
| 334 | * @return length of the match; it is in the range [0, lenLimit]
|
|---|
| 335 | */
|
|---|
| 336 | public int getMatchLen(int dist, int lenLimit) {
|
|---|
| 337 | int backPos = readPos - dist - 1;
|
|---|
| 338 | int len = 0;
|
|---|
| 339 |
|
|---|
| 340 | while (len < lenLimit && buf[readPos + len] == buf[backPos + len])
|
|---|
| 341 | ++len;
|
|---|
| 342 |
|
|---|
| 343 | return len;
|
|---|
| 344 | }
|
|---|
| 345 |
|
|---|
| 346 | /**
|
|---|
| 347 | * Get the length of a match at the given distance and forward offset.
|
|---|
| 348 | *
|
|---|
| 349 | * @param forward forward offset
|
|---|
| 350 | * @param dist zero-based distance of the match to test
|
|---|
| 351 | * @param lenLimit don't test for a match longer than this
|
|---|
| 352 | *
|
|---|
| 353 | * @return length of the match; it is in the range [0, lenLimit]
|
|---|
| 354 | */
|
|---|
| 355 | public int getMatchLen(int forward, int dist, int lenLimit) {
|
|---|
| 356 | int curPos = readPos + forward;
|
|---|
| 357 | int backPos = curPos - dist - 1;
|
|---|
| 358 | int len = 0;
|
|---|
| 359 |
|
|---|
| 360 | while (len < lenLimit && buf[curPos + len] == buf[backPos + len])
|
|---|
| 361 | ++len;
|
|---|
| 362 |
|
|---|
| 363 | return len;
|
|---|
| 364 | }
|
|---|
| 365 |
|
|---|
| 366 | /**
|
|---|
| 367 | * Verifies that the matches returned by the match finder are valid.
|
|---|
| 368 | * This is meant to be used in an assert statement. This is totally
|
|---|
| 369 | * useless for actual encoding since match finder's results should
|
|---|
| 370 | * naturally always be valid if it isn't broken.
|
|---|
| 371 | *
|
|---|
| 372 | * @param matches return value from <code>getMatches</code>
|
|---|
| 373 | *
|
|---|
| 374 | * @return true if matches are valid, false if match finder is broken
|
|---|
| 375 | */
|
|---|
| 376 | public boolean verifyMatches(Matches matches) {
|
|---|
| 377 | int lenLimit = Math.min(getAvail(), matchLenMax);
|
|---|
| 378 |
|
|---|
| 379 | for (int i = 0; i < matches.count; ++i)
|
|---|
| 380 | if (getMatchLen(matches.dist[i], lenLimit) != matches.len[i])
|
|---|
| 381 | return false;
|
|---|
| 382 |
|
|---|
| 383 | return true;
|
|---|
| 384 | }
|
|---|
| 385 |
|
|---|
| 386 | /**
|
|---|
| 387 | * Moves to the next byte, checks if there is enough input available,
|
|---|
| 388 | * and returns the amount of input available.
|
|---|
| 389 | *
|
|---|
| 390 | * @param requiredForFlushing
|
|---|
| 391 | * minimum number of available bytes when
|
|---|
| 392 | * flushing; encoding may be continued with
|
|---|
| 393 | * new input after flushing
|
|---|
| 394 | * @param requiredForFinishing
|
|---|
| 395 | * minimum number of available bytes when
|
|---|
| 396 | * finishing; encoding must not be continued
|
|---|
| 397 | * after finishing or the match finder state
|
|---|
| 398 | * may be corrupt
|
|---|
| 399 | *
|
|---|
| 400 | * @return the number of bytes available or zero if there
|
|---|
| 401 | * is not enough input available
|
|---|
| 402 | */
|
|---|
| 403 | int movePos(int requiredForFlushing, int requiredForFinishing) {
|
|---|
| 404 | assert requiredForFlushing >= requiredForFinishing;
|
|---|
| 405 |
|
|---|
| 406 | ++readPos;
|
|---|
| 407 | int avail = writePos - readPos;
|
|---|
| 408 |
|
|---|
| 409 | if (avail < requiredForFlushing) {
|
|---|
| 410 | if (avail < requiredForFinishing || !finishing) {
|
|---|
| 411 | ++pendingSize;
|
|---|
| 412 | avail = 0;
|
|---|
| 413 | }
|
|---|
| 414 | }
|
|---|
| 415 |
|
|---|
| 416 | return avail;
|
|---|
| 417 | }
|
|---|
| 418 |
|
|---|
| 419 | /**
|
|---|
| 420 | * Runs match finder for the next byte and returns the matches found.
|
|---|
| 421 | */
|
|---|
| 422 | public abstract Matches getMatches();
|
|---|
| 423 |
|
|---|
| 424 | /**
|
|---|
| 425 | * Skips the given number of bytes in the match finder.
|
|---|
| 426 | */
|
|---|
| 427 | public abstract void skip(int len);
|
|---|
| 428 | }
|
|---|