| 1 | package org.openstreetmap.josm.tools; |
| 2 | |
| 3 | //// Taken from http://www.bmsi.com/java/#diff |
| 4 | |
| 5 | // http://www.bmsi.com/java/DiffPrint.java could also be useful |
| 6 | |
| 7 | /* |
| 8 | * $Log: Diff.java,v $ |
| 9 | * Revision 1.7 2009/01/19 03:05:26 stuart |
| 10 | * Fix StackOverflow bug with heuristic on reported by Jimmy Han. |
| 11 | * |
| 12 | * Revision 1.6 2003/03/06 22:51:32 stuart |
| 13 | * Convert to CVS |
| 14 | * |
| 15 | * Revision 1.5 2002/07/19 19:14:40 stuart |
| 16 | * fix reverseScript, make change ctor public, update docs |
| 17 | * |
| 18 | * Revision 1.4 2002/04/09 17:53:39 stuart |
| 19 | * More flexible interface for diff() function. |
| 20 | * |
| 21 | * Revision 1.3 2000/03/03 21:58:03 stuart |
| 22 | * move discard_confusing_lines and shift_boundaries to class file_data |
| 23 | * |
| 24 | * Revision 1.2 2000/03/02 16:37:38 stuart |
| 25 | * Add GPL and copyright |
| 26 | * |
| 27 | */ |
| 28 | |
| 29 | import java.util.Hashtable; |
| 30 | |
| 31 | /** A class to compare vectors of objects. The result of comparison |
| 32 | is a list of <code>change</code> objects which form an |
| 33 | edit script. The objects compared are traditionally lines |
| 34 | of text from two files. Comparison options such as "ignore |
| 35 | whitespace" are implemented by modifying the <code>equals</code> |
| 36 | and <code>hashcode</code> methods for the objects compared. |
| 37 | <p> |
| 38 | The basic algorithm is described in: </br> |
| 39 | "An O(ND) Difference Algorithm and its Variations", Eugene Myers, |
| 40 | Algorithmica Vol. 1 No. 2, 1986, p 251. |
| 41 | <p> |
| 42 | This class outputs different results from GNU diff 1.15 on some |
| 43 | inputs. Our results are actually better (smaller change list, smaller |
| 44 | total size of changes), but it would be nice to know why. Perhaps |
| 45 | there is a memory overwrite bug in GNU diff 1.15. |
| 46 | |
| 47 | @author Stuart D. Gathman, translated from GNU diff 1.15 |
| 48 | Copyright (C) 2000 Business Management Systems, Inc. |
| 49 | <p> |
| 50 | This program is free software; you can redistribute it and/or modify |
| 51 | it under the terms of the GNU General Public License as published by |
| 52 | the Free Software Foundation; either version 1, or (at your option) |
| 53 | any later version. |
| 54 | <p> |
| 55 | This program is distributed in the hope that it will be useful, |
| 56 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 57 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 58 | GNU General Public License for more details. |
| 59 | <p> |
| 60 | You should have received a copy of the <a href=COPYING.txt> |
| 61 | GNU General Public License</a> |
| 62 | along with this program; if not, write to the Free Software |
| 63 | Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
| 64 | |
| 65 | */ |
| 66 | |
| 67 | public class Diff { |
| 68 | |
| 69 | /** Prepare to find differences between two arrays. Each element of |
| 70 | the arrays is translated to an "equivalence number" based on |
| 71 | the result of <code>equals</code>. The original Object arrays |
| 72 | are no longer needed for computing the differences. They will |
| 73 | be needed again later to print the results of the comparison as |
| 74 | an edit script, if desired. |
| 75 | */ |
| 76 | public Diff(Object[] a,Object[] b) { |
| 77 | Hashtable h = new Hashtable(a.length + b.length); |
| 78 | filevec[0] = new file_data(a,h); |
| 79 | filevec[1] = new file_data(b,h); |
| 80 | } |
| 81 | |
| 82 | /** 1 more than the maximum equivalence value used for this or its |
| 83 | sibling file. */ |
| 84 | private int equiv_max = 1; |
| 85 | |
| 86 | /** When set to true, the comparison uses a heuristic to speed it up. |
| 87 | With this heuristic, for files with a constant small density |
| 88 | of changes, the algorithm is linear in the file size. */ |
| 89 | public boolean heuristic = false; |
| 90 | |
| 91 | /** When set to true, the algorithm returns a guarranteed minimal |
| 92 | set of changes. This makes things slower, sometimes much slower. */ |
| 93 | public boolean no_discards = false; |
| 94 | |
| 95 | private int[] xvec, yvec; /* Vectors being compared. */ |
| 96 | private int[] fdiag; /* Vector, indexed by diagonal, containing |
| 97 | the X coordinate of the point furthest |
| 98 | along the given diagonal in the forward |
| 99 | search of the edit matrix. */ |
| 100 | private int[] bdiag; /* Vector, indexed by diagonal, containing |
| 101 | the X coordinate of the point furthest |
| 102 | along the given diagonal in the backward |
| 103 | search of the edit matrix. */ |
| 104 | private int fdiagoff, bdiagoff; |
| 105 | private final file_data[] filevec = new file_data[2]; |
| 106 | private int cost; |
| 107 | /** Snakes bigger than this are considered "big". */ |
| 108 | private static final int SNAKE_LIMIT = 20; |
| 109 | |
| 110 | /** Find the midpoint of the shortest edit script for a specified |
| 111 | portion of the two files. |
| 112 | |
| 113 | We scan from the beginnings of the files, and simultaneously from the ends, |
| 114 | doing a breadth-first search through the space of edit-sequence. |
| 115 | When the two searches meet, we have found the midpoint of the shortest |
| 116 | edit sequence. |
| 117 | |
| 118 | The value returned is the number of the diagonal on which the midpoint lies. |
| 119 | The diagonal number equals the number of inserted lines minus the number |
| 120 | of deleted lines (counting only lines before the midpoint). |
| 121 | The edit cost is stored into COST; this is the total number of |
| 122 | lines inserted or deleted (counting only lines before the midpoint). |
| 123 | |
| 124 | This function assumes that the first lines of the specified portions |
| 125 | of the two files do not match, and likewise that the last lines do not |
| 126 | match. The caller must trim matching lines from the beginning and end |
| 127 | of the portions it is going to specify. |
| 128 | |
| 129 | Note that if we return the "wrong" diagonal value, or if |
| 130 | the value of bdiag at that diagonal is "wrong", |
| 131 | the worst this can do is cause suboptimal diff output. |
| 132 | It cannot cause incorrect diff output. */ |
| 133 | |
| 134 | private int diag (int xoff, int xlim, int yoff, int ylim) { |
| 135 | final int[] fd = fdiag; // Give the compiler a chance. |
| 136 | final int[] bd = bdiag; // Additional help for the compiler. |
| 137 | final int[] xv = xvec; // Still more help for the compiler. |
| 138 | final int[] yv = yvec; // And more and more . . . |
| 139 | final int dmin = xoff - ylim; // Minimum valid diagonal. |
| 140 | final int dmax = xlim - yoff; // Maximum valid diagonal. |
| 141 | final int fmid = xoff - yoff; // Center diagonal of top-down search. |
| 142 | final int bmid = xlim - ylim; // Center diagonal of bottom-up search. |
| 143 | int fmin = fmid, fmax = fmid; // Limits of top-down search. |
| 144 | int bmin = bmid, bmax = bmid; // Limits of bottom-up search. |
| 145 | /* True if southeast corner is on an odd |
| 146 | diagonal with respect to the northwest. */ |
| 147 | final boolean odd = (fmid - bmid & 1) != 0; |
| 148 | |
| 149 | fd[fdiagoff + fmid] = xoff; |
| 150 | bd[bdiagoff + bmid] = xlim; |
| 151 | |
| 152 | for (int c = 1;; ++c) |
| 153 | { |
| 154 | int d; /* Active diagonal. */ |
| 155 | boolean big_snake = false; |
| 156 | |
| 157 | /* Extend the top-down search by an edit step in each diagonal. */ |
| 158 | if (fmin > dmin) { |
| 159 | fd[fdiagoff + --fmin - 1] = -1; |
| 160 | } else { |
| 161 | ++fmin; |
| 162 | } |
| 163 | if (fmax < dmax) { |
| 164 | fd[fdiagoff + ++fmax + 1] = -1; |
| 165 | } else { |
| 166 | --fmax; |
| 167 | } |
| 168 | for (d = fmax; d >= fmin; d -= 2) |
| 169 | { |
| 170 | int x, y, oldx, tlo = fd[fdiagoff + d - 1], thi = fd[fdiagoff + d + 1]; |
| 171 | |
| 172 | if (tlo >= thi) { |
| 173 | x = tlo + 1; |
| 174 | } else { |
| 175 | x = thi; |
| 176 | } |
| 177 | oldx = x; |
| 178 | y = x - d; |
| 179 | while (x < xlim && y < ylim && xv[x] == yv[y]) { |
| 180 | ++x; ++y; |
| 181 | } |
| 182 | if (x - oldx > SNAKE_LIMIT) { |
| 183 | big_snake = true; |
| 184 | } |
| 185 | fd[fdiagoff + d] = x; |
| 186 | if (odd && bmin <= d && d <= bmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) |
| 187 | { |
| 188 | cost = 2 * c - 1; |
| 189 | return d; |
| 190 | } |
| 191 | } |
| 192 | |
| 193 | /* Similar extend the bottom-up search. */ |
| 194 | if (bmin > dmin) { |
| 195 | bd[bdiagoff + --bmin - 1] = Integer.MAX_VALUE; |
| 196 | } else { |
| 197 | ++bmin; |
| 198 | } |
| 199 | if (bmax < dmax) { |
| 200 | bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE; |
| 201 | } else { |
| 202 | --bmax; |
| 203 | } |
| 204 | for (d = bmax; d >= bmin; d -= 2) |
| 205 | { |
| 206 | int x, y, oldx, tlo = bd[bdiagoff + d - 1], thi = bd[bdiagoff + d + 1]; |
| 207 | |
| 208 | if (tlo < thi) { |
| 209 | x = tlo; |
| 210 | } else { |
| 211 | x = thi - 1; |
| 212 | } |
| 213 | oldx = x; |
| 214 | y = x - d; |
| 215 | while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) { |
| 216 | --x; --y; |
| 217 | } |
| 218 | if (oldx - x > SNAKE_LIMIT) { |
| 219 | big_snake = true; |
| 220 | } |
| 221 | bd[bdiagoff + d] = x; |
| 222 | if (!odd && fmin <= d && d <= fmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) |
| 223 | { |
| 224 | cost = 2 * c; |
| 225 | return d; |
| 226 | } |
| 227 | } |
| 228 | |
| 229 | /* Heuristic: check occasionally for a diagonal that has made |
| 230 | lots of progress compared with the edit distance. |
| 231 | If we have any such, find the one that has made the most |
| 232 | progress and return it as if it had succeeded. |
| 233 | |
| 234 | With this heuristic, for files with a constant small density |
| 235 | of changes, the algorithm is linear in the file size. */ |
| 236 | |
| 237 | if (c > 200 && big_snake && heuristic) |
| 238 | { |
| 239 | int best = 0; |
| 240 | int bestpos = -1; |
| 241 | |
| 242 | for (d = fmax; d >= fmin; d -= 2) |
| 243 | { |
| 244 | int dd = d - fmid; |
| 245 | int x = fd[fdiagoff + d]; |
| 246 | int y = x - d; |
| 247 | int v = (x - xoff) * 2 - dd; |
| 248 | if (v > 12 * (c + (dd < 0 ? -dd : dd))) |
| 249 | { |
| 250 | if (v > best |
| 251 | && xoff + SNAKE_LIMIT <= x && x < xlim |
| 252 | && yoff + SNAKE_LIMIT <= y && y < ylim) |
| 253 | { |
| 254 | /* We have a good enough best diagonal; |
| 255 | now insist that it end with a significant snake. */ |
| 256 | int k; |
| 257 | |
| 258 | for (k = 1; xvec[x - k] == yvec[y - k]; k++) |
| 259 | if (k == SNAKE_LIMIT) |
| 260 | { |
| 261 | best = v; |
| 262 | bestpos = d; |
| 263 | break; |
| 264 | } |
| 265 | } |
| 266 | } |
| 267 | } |
| 268 | if (best > 0) |
| 269 | { |
| 270 | cost = 2 * c - 1; |
| 271 | return bestpos; |
| 272 | } |
| 273 | |
| 274 | best = 0; |
| 275 | for (d = bmax; d >= bmin; d -= 2) |
| 276 | { |
| 277 | int dd = d - bmid; |
| 278 | int x = bd[bdiagoff + d]; |
| 279 | int y = x - d; |
| 280 | int v = (xlim - x) * 2 + dd; |
| 281 | if (v > 12 * (c + (dd < 0 ? -dd : dd))) |
| 282 | { |
| 283 | if (v > best |
| 284 | && xoff < x && x <= xlim - SNAKE_LIMIT |
| 285 | && yoff < y && y <= ylim - SNAKE_LIMIT) |
| 286 | { |
| 287 | /* We have a good enough best diagonal; |
| 288 | now insist that it end with a significant snake. */ |
| 289 | int k; |
| 290 | |
| 291 | for (k = 0; xvec[x + k] == yvec[y + k]; k++) |
| 292 | if (k == SNAKE_LIMIT) |
| 293 | { |
| 294 | best = v; |
| 295 | bestpos = d; |
| 296 | break; |
| 297 | } |
| 298 | } |
| 299 | } |
| 300 | } |
| 301 | if (best > 0) |
| 302 | { |
| 303 | cost = 2 * c - 1; |
| 304 | return bestpos; |
| 305 | } |
| 306 | } |
| 307 | } |
| 308 | } |
| 309 | |
| 310 | /** Compare in detail contiguous subsequences of the two files |
| 311 | which are known, as a whole, to match each other. |
| 312 | |
| 313 | The results are recorded in the vectors filevec[N].changed_flag, by |
| 314 | storing a 1 in the element for each line that is an insertion or deletion. |
| 315 | |
| 316 | The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1. |
| 317 | |
| 318 | Note that XLIM, YLIM are exclusive bounds. |
| 319 | All line numbers are origin-0 and discarded lines are not counted. */ |
| 320 | |
| 321 | private void compareseq (int xoff, int xlim, int yoff, int ylim) { |
| 322 | /* Slide down the bottom initial diagonal. */ |
| 323 | while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) { |
| 324 | ++xoff; ++yoff; |
| 325 | } |
| 326 | /* Slide up the top initial diagonal. */ |
| 327 | while (xlim > xoff && ylim > yoff && xvec[xlim - 1] == yvec[ylim - 1]) { |
| 328 | --xlim; --ylim; |
| 329 | } |
| 330 | |
| 331 | /* Handle simple cases. */ |
| 332 | if (xoff == xlim) { |
| 333 | while (yoff < ylim) { |
| 334 | filevec[1].changed_flag[1+filevec[1].realindexes[yoff++]] = true; |
| 335 | } |
| 336 | } else if (yoff == ylim) { |
| 337 | while (xoff < xlim) { |
| 338 | filevec[0].changed_flag[1+filevec[0].realindexes[xoff++]] = true; |
| 339 | } |
| 340 | } else |
| 341 | { |
| 342 | /* Find a point of correspondence in the middle of the files. */ |
| 343 | |
| 344 | int d = diag (xoff, xlim, yoff, ylim); |
| 345 | int c = cost; |
| 346 | int f = fdiag[fdiagoff + d]; |
| 347 | int b = bdiag[bdiagoff + d]; |
| 348 | |
| 349 | if (c == 1) |
| 350 | /* This should be impossible, because it implies that |
| 351 | one of the two subsequences is empty, |
| 352 | and that case was handled above without calling `diag'. |
| 353 | Let's verify that this is true. */ |
| 354 | throw new IllegalArgumentException("Empty subsequence"); |
| 355 | else |
| 356 | { |
| 357 | /* Use that point to split this problem into two subproblems. */ |
| 358 | compareseq (xoff, b, yoff, b - d); |
| 359 | /* This used to use f instead of b, |
| 360 | but that is incorrect! |
| 361 | It is not necessarily the case that diagonal d |
| 362 | has a snake from b to f. */ |
| 363 | compareseq (b, xlim, b - d, ylim); |
| 364 | } |
| 365 | } |
| 366 | } |
| 367 | |
| 368 | /** Discard lines from one file that have no matches in the other file. |
| 369 | */ |
| 370 | |
| 371 | private void discard_confusing_lines() { |
| 372 | filevec[0].discard_confusing_lines(filevec[1]); |
| 373 | filevec[1].discard_confusing_lines(filevec[0]); |
| 374 | } |
| 375 | |
| 376 | private boolean inhibit = false; |
| 377 | |
| 378 | /** Adjust inserts/deletes of blank lines to join changes |
| 379 | as much as possible. |
| 380 | */ |
| 381 | |
| 382 | private void shift_boundaries() { |
| 383 | if (inhibit) |
| 384 | return; |
| 385 | filevec[0].shift_boundaries(filevec[1]); |
| 386 | filevec[1].shift_boundaries(filevec[0]); |
| 387 | } |
| 388 | |
| 389 | public interface ScriptBuilder { |
| 390 | /** Scan the tables of which lines are inserted and deleted, |
| 391 | producing an edit script. |
| 392 | @param changed0 true for lines in first file which do not match 2nd |
| 393 | @param len0 number of lines in first file |
| 394 | @param changed1 true for lines in 2nd file which do not match 1st |
| 395 | @param len1 number of lines in 2nd file |
| 396 | @return a linked list of changes - or null |
| 397 | */ |
| 398 | public change build_script( |
| 399 | boolean[] changed0,int len0, |
| 400 | boolean[] changed1,int len1 |
| 401 | ); |
| 402 | } |
| 403 | |
| 404 | /** Scan the tables of which lines are inserted and deleted, |
| 405 | producing an edit script in reverse order. */ |
| 406 | |
| 407 | static class ReverseScript implements ScriptBuilder { |
| 408 | public change build_script( |
| 409 | final boolean[] changed0,int len0, |
| 410 | final boolean[] changed1,int len1) |
| 411 | { |
| 412 | change script = null; |
| 413 | int i0 = 0, i1 = 0; |
| 414 | while (i0 < len0 || i1 < len1) { |
| 415 | if (changed0[1+i0] || changed1[1+i1]) { |
| 416 | int line0 = i0, line1 = i1; |
| 417 | |
| 418 | /* Find # lines changed here in each file. */ |
| 419 | while (changed0[1+i0]) { |
| 420 | ++i0; |
| 421 | } |
| 422 | while (changed1[1+i1]) { |
| 423 | ++i1; |
| 424 | } |
| 425 | |
| 426 | /* Record this change. */ |
| 427 | script = new change(line0, line1, i0 - line0, i1 - line1, script); |
| 428 | } |
| 429 | |
| 430 | /* We have reached lines in the two files that match each other. */ |
| 431 | i0++; i1++; |
| 432 | } |
| 433 | |
| 434 | return script; |
| 435 | } |
| 436 | } |
| 437 | |
| 438 | static class ForwardScript implements ScriptBuilder { |
| 439 | /** Scan the tables of which lines are inserted and deleted, |
| 440 | producing an edit script in forward order. */ |
| 441 | public change build_script( |
| 442 | final boolean[] changed0,int len0, |
| 443 | final boolean[] changed1,int len1) |
| 444 | { |
| 445 | change script = null; |
| 446 | int i0 = len0, i1 = len1; |
| 447 | |
| 448 | while (i0 >= 0 || i1 >= 0) |
| 449 | { |
| 450 | if (changed0[i0] || changed1[i1]) |
| 451 | { |
| 452 | int line0 = i0, line1 = i1; |
| 453 | |
| 454 | /* Find # lines changed here in each file. */ |
| 455 | while (changed0[i0]) { |
| 456 | --i0; |
| 457 | } |
| 458 | while (changed1[i1]) { |
| 459 | --i1; |
| 460 | } |
| 461 | |
| 462 | /* Record this change. */ |
| 463 | script = new change(i0, i1, line0 - i0, line1 - i1, script); |
| 464 | } |
| 465 | |
| 466 | /* We have reached lines in the two files that match each other. */ |
| 467 | i0--; i1--; |
| 468 | } |
| 469 | |
| 470 | return script; |
| 471 | } |
| 472 | } |
| 473 | |
| 474 | /** Standard ScriptBuilders. */ |
| 475 | public final static ScriptBuilder |
| 476 | forwardScript = new ForwardScript(), |
| 477 | reverseScript = new ReverseScript(); |
| 478 | |
| 479 | /* Report the differences of two files. DEPTH is the current directory |
| 480 | depth. */ |
| 481 | public final change diff_2(final boolean reverse) { |
| 482 | return diff(reverse ? reverseScript : forwardScript); |
| 483 | } |
| 484 | |
| 485 | /** Get the results of comparison as an edit script. The script |
| 486 | is described by a list of changes. The standard ScriptBuilder |
| 487 | implementations provide for forward and reverse edit scripts. |
| 488 | Alternate implementations could, for instance, list common elements |
| 489 | instead of differences. |
| 490 | @param bld an object to build the script from change flags |
| 491 | @return the head of a list of changes |
| 492 | */ |
| 493 | public change diff(final ScriptBuilder bld) { |
| 494 | |
| 495 | /* Some lines are obviously insertions or deletions |
| 496 | because they don't match anything. Detect them now, |
| 497 | and avoid even thinking about them in the main comparison algorithm. */ |
| 498 | |
| 499 | discard_confusing_lines (); |
| 500 | |
| 501 | /* Now do the main comparison algorithm, considering just the |
| 502 | undiscarded lines. */ |
| 503 | |
| 504 | xvec = filevec[0].undiscarded; |
| 505 | yvec = filevec[1].undiscarded; |
| 506 | |
| 507 | int diags = |
| 508 | filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3; |
| 509 | fdiag = new int[diags]; |
| 510 | fdiagoff = filevec[1].nondiscarded_lines + 1; |
| 511 | bdiag = new int[diags]; |
| 512 | bdiagoff = filevec[1].nondiscarded_lines + 1; |
| 513 | |
| 514 | compareseq (0, filevec[0].nondiscarded_lines, |
| 515 | 0, filevec[1].nondiscarded_lines); |
| 516 | fdiag = null; |
| 517 | bdiag = null; |
| 518 | |
| 519 | /* Modify the results slightly to make them prettier |
| 520 | in cases where that can validly be done. */ |
| 521 | |
| 522 | shift_boundaries (); |
| 523 | |
| 524 | /* Get the results of comparison in the form of a chain |
| 525 | of `struct change's -- an edit script. */ |
| 526 | return bld.build_script( |
| 527 | filevec[0].changed_flag, |
| 528 | filevec[0].buffered_lines, |
| 529 | filevec[1].changed_flag, |
| 530 | filevec[1].buffered_lines |
| 531 | ); |
| 532 | |
| 533 | } |
| 534 | |
| 535 | /** The result of comparison is an "edit script": a chain of change objects. |
| 536 | Each change represents one place where some lines are deleted |
| 537 | and some are inserted. |
| 538 | |
| 539 | LINE0 and LINE1 are the first affected lines in the two files (origin 0). |
| 540 | DELETED is the number of lines deleted here from file 0. |
| 541 | INSERTED is the number of lines inserted here in file 1. |
| 542 | |
| 543 | If DELETED is 0 then LINE0 is the number of the line before |
| 544 | which the insertion was done; vice versa for INSERTED and LINE1. */ |
| 545 | |
| 546 | public static class change { |
| 547 | /** Previous or next edit command. */ |
| 548 | public change link; |
| 549 | /** # lines of file 1 changed here. */ |
| 550 | public final int inserted; |
| 551 | /** # lines of file 0 changed here. */ |
| 552 | public final int deleted; |
| 553 | /** Line number of 1st deleted line. */ |
| 554 | public final int line0; |
| 555 | /** Line number of 1st inserted line. */ |
| 556 | public final int line1; |
| 557 | |
| 558 | /** Cons an additional entry onto the front of an edit script OLD. |
| 559 | LINE0 and LINE1 are the first affected lines in the two files (origin 0). |
| 560 | DELETED is the number of lines deleted here from file 0. |
| 561 | INSERTED is the number of lines inserted here in file 1. |
| 562 | |
| 563 | If DELETED is 0 then LINE0 is the number of the line before |
| 564 | which the insertion was done; vice versa for INSERTED and LINE1. */ |
| 565 | public change(int line0, int line1, int deleted, int inserted, change old) { |
| 566 | this.line0 = line0; |
| 567 | this.line1 = line1; |
| 568 | this.inserted = inserted; |
| 569 | this.deleted = deleted; |
| 570 | this.link = old; |
| 571 | //System.err.println(line0+","+line1+","+inserted+","+deleted); |
| 572 | } |
| 573 | } |
| 574 | |
| 575 | /** Data on one input file being compared. |
| 576 | */ |
| 577 | |
| 578 | class file_data { |
| 579 | |
| 580 | /** Allocate changed array for the results of comparison. */ |
| 581 | void clear() { |
| 582 | /* Allocate a flag for each line of each file, saying whether that line |
| 583 | is an insertion or deletion. |
| 584 | Allocate an extra element, always zero, at each end of each vector. |
| 585 | */ |
| 586 | changed_flag = new boolean[buffered_lines + 2]; |
| 587 | } |
| 588 | |
| 589 | /** Return equiv_count[I] as the number of lines in this file |
| 590 | that fall in equivalence class I. |
| 591 | @return the array of equivalence class counts. |
| 592 | */ |
| 593 | int[] equivCount() { |
| 594 | int[] equiv_count = new int[equiv_max]; |
| 595 | for (int i = 0; i < buffered_lines; ++i) { |
| 596 | ++equiv_count[equivs[i]]; |
| 597 | } |
| 598 | return equiv_count; |
| 599 | } |
| 600 | |
| 601 | /** Discard lines that have no matches in another file. |
| 602 | |
| 603 | A line which is discarded will not be considered by the actual |
| 604 | comparison algorithm; it will be as if that line were not in the file. |
| 605 | The file's `realindexes' table maps virtual line numbers |
| 606 | (which don't count the discarded lines) into real line numbers; |
| 607 | this is how the actual comparison algorithm produces results |
| 608 | that are comprehensible when the discarded lines are counted. |
| 609 | <p> |
| 610 | When we discard a line, we also mark it as a deletion or insertion |
| 611 | so that it will be printed in the output. |
| 612 | @param f the other file |
| 613 | */ |
| 614 | void discard_confusing_lines(file_data f) { |
| 615 | clear(); |
| 616 | /* Set up table of which lines are going to be discarded. */ |
| 617 | final byte[] discarded = discardable(f.equivCount()); |
| 618 | |
| 619 | /* Don't really discard the provisional lines except when they occur |
| 620 | in a run of discardables, with nonprovisionals at the beginning |
| 621 | and end. */ |
| 622 | filterDiscards(discarded); |
| 623 | |
| 624 | /* Actually discard the lines. */ |
| 625 | discard(discarded); |
| 626 | } |
| 627 | |
| 628 | /** Mark to be discarded each line that matches no line of another file. |
| 629 | If a line matches many lines, mark it as provisionally discardable. |
| 630 | @see equivCount() |
| 631 | @param counts The count of each equivalence number for the other file. |
| 632 | @return 0=nondiscardable, 1=discardable or 2=provisionally discardable |
| 633 | for each line |
| 634 | */ |
| 635 | |
| 636 | private byte[] discardable(final int[] counts) { |
| 637 | final int end = buffered_lines; |
| 638 | final byte[] discards = new byte[end]; |
| 639 | final int[] equivs = this.equivs; |
| 640 | int many = 5; |
| 641 | int tem = end / 64; |
| 642 | |
| 643 | /* Multiply MANY by approximate square root of number of lines. |
| 644 | That is the threshold for provisionally discardable lines. */ |
| 645 | while ((tem = tem >> 2) > 0) { |
| 646 | many *= 2; |
| 647 | } |
| 648 | |
| 649 | for (int i = 0; i < end; i++) |
| 650 | { |
| 651 | int nmatch; |
| 652 | if (equivs[i] == 0) { |
| 653 | continue; |
| 654 | } |
| 655 | nmatch = counts[equivs[i]]; |
| 656 | if (nmatch == 0) { |
| 657 | discards[i] = 1; |
| 658 | } else if (nmatch > many) { |
| 659 | discards[i] = 2; |
| 660 | } |
| 661 | } |
| 662 | return discards; |
| 663 | } |
| 664 | |
| 665 | /** Don't really discard the provisional lines except when they occur |
| 666 | in a run of discardables, with nonprovisionals at the beginning |
| 667 | and end. */ |
| 668 | |
| 669 | private void filterDiscards(final byte[] discards) { |
| 670 | final int end = buffered_lines; |
| 671 | |
| 672 | for (int i = 0; i < end; i++) |
| 673 | { |
| 674 | /* Cancel provisional discards not in middle of run of discards. */ |
| 675 | if (discards[i] == 2) { |
| 676 | discards[i] = 0; |
| 677 | } else if (discards[i] != 0) |
| 678 | { |
| 679 | /* We have found a nonprovisional discard. */ |
| 680 | int j; |
| 681 | int length; |
| 682 | int provisional = 0; |
| 683 | |
| 684 | /* Find end of this run of discardable lines. |
| 685 | Count how many are provisionally discardable. */ |
| 686 | for (j = i; j < end; j++) |
| 687 | { |
| 688 | if (discards[j] == 0) { |
| 689 | break; |
| 690 | } |
| 691 | if (discards[j] == 2) { |
| 692 | ++provisional; |
| 693 | } |
| 694 | } |
| 695 | |
| 696 | /* Cancel provisional discards at end, and shrink the run. */ |
| 697 | while (j > i && discards[j - 1] == 2) { |
| 698 | discards[--j] = 0; --provisional; |
| 699 | } |
| 700 | |
| 701 | /* Now we have the length of a run of discardable lines |
| 702 | whose first and last are not provisional. */ |
| 703 | length = j - i; |
| 704 | |
| 705 | /* If 1/4 of the lines in the run are provisional, |
| 706 | cancel discarding of all provisional lines in the run. */ |
| 707 | if (provisional * 4 > length) |
| 708 | { |
| 709 | while (j > i) |
| 710 | if (discards[--j] == 2) { |
| 711 | discards[j] = 0; |
| 712 | } |
| 713 | } |
| 714 | else |
| 715 | { |
| 716 | int consec; |
| 717 | int minimum = 1; |
| 718 | int tem = length / 4; |
| 719 | |
| 720 | /* MINIMUM is approximate square root of LENGTH/4. |
| 721 | A subrun of two or more provisionals can stand |
| 722 | when LENGTH is at least 16. |
| 723 | A subrun of 4 or more can stand when LENGTH >= 64. */ |
| 724 | while ((tem = tem >> 2) > 0) { |
| 725 | minimum *= 2; |
| 726 | } |
| 727 | minimum++; |
| 728 | |
| 729 | /* Cancel any subrun of MINIMUM or more provisionals |
| 730 | within the larger run. */ |
| 731 | for (j = 0, consec = 0; j < length; j++) |
| 732 | if (discards[i + j] != 2) { |
| 733 | consec = 0; |
| 734 | } else if (minimum == ++consec) { |
| 735 | /* Back up to start of subrun, to cancel it all. */ |
| 736 | j -= consec; |
| 737 | } else if (minimum < consec) { |
| 738 | discards[i + j] = 0; |
| 739 | } |
| 740 | |
| 741 | /* Scan from beginning of run |
| 742 | until we find 3 or more nonprovisionals in a row |
| 743 | or until the first nonprovisional at least 8 lines in. |
| 744 | Until that point, cancel any provisionals. */ |
| 745 | for (j = 0, consec = 0; j < length; j++) |
| 746 | { |
| 747 | if (j >= 8 && discards[i + j] == 1) { |
| 748 | break; |
| 749 | } |
| 750 | if (discards[i + j] == 2) { |
| 751 | consec = 0; discards[i + j] = 0; |
| 752 | } |
| 753 | else if (discards[i + j] == 0) { |
| 754 | consec = 0; |
| 755 | } else { |
| 756 | consec++; |
| 757 | } |
| 758 | if (consec == 3) { |
| 759 | break; |
| 760 | } |
| 761 | } |
| 762 | |
| 763 | /* I advances to the last line of the run. */ |
| 764 | i += length - 1; |
| 765 | |
| 766 | /* Same thing, from end. */ |
| 767 | for (j = 0, consec = 0; j < length; j++) |
| 768 | { |
| 769 | if (j >= 8 && discards[i - j] == 1) { |
| 770 | break; |
| 771 | } |
| 772 | if (discards[i - j] == 2) { |
| 773 | consec = 0; discards[i - j] = 0; |
| 774 | } |
| 775 | else if (discards[i - j] == 0) { |
| 776 | consec = 0; |
| 777 | } else { |
| 778 | consec++; |
| 779 | } |
| 780 | if (consec == 3) { |
| 781 | break; |
| 782 | } |
| 783 | } |
| 784 | } |
| 785 | } |
| 786 | } |
| 787 | } |
| 788 | |
| 789 | /** Actually discard the lines. |
| 790 | @param discards flags lines to be discarded |
| 791 | */ |
| 792 | private void discard(final byte[] discards) { |
| 793 | final int end = buffered_lines; |
| 794 | int j = 0; |
| 795 | for (int i = 0; i < end; ++i) |
| 796 | if (no_discards || discards[i] == 0) |
| 797 | { |
| 798 | undiscarded[j] = equivs[i]; |
| 799 | realindexes[j++] = i; |
| 800 | } else { |
| 801 | changed_flag[1+i] = true; |
| 802 | } |
| 803 | nondiscarded_lines = j; |
| 804 | } |
| 805 | |
| 806 | file_data(Object[] data,Hashtable h) { |
| 807 | buffered_lines = data.length; |
| 808 | |
| 809 | equivs = new int[buffered_lines]; |
| 810 | undiscarded = new int[buffered_lines]; |
| 811 | realindexes = new int[buffered_lines]; |
| 812 | |
| 813 | for (int i = 0; i < data.length; ++i) { |
| 814 | Integer ir = (Integer)h.get(data[i]); |
| 815 | if (ir == null) { |
| 816 | h.put(data[i],new Integer(equivs[i] = equiv_max++)); |
| 817 | } else { |
| 818 | equivs[i] = ir.intValue(); |
| 819 | } |
| 820 | } |
| 821 | } |
| 822 | |
| 823 | /** Adjust inserts/deletes of blank lines to join changes |
| 824 | as much as possible. |
| 825 | |
| 826 | We do something when a run of changed lines include a blank |
| 827 | line at one end and have an excluded blank line at the other. |
| 828 | We are free to choose which blank line is included. |
| 829 | `compareseq' always chooses the one at the beginning, |
| 830 | but usually it is cleaner to consider the following blank line |
| 831 | to be the "change". The only exception is if the preceding blank line |
| 832 | would join this change to other changes. |
| 833 | @param f the file being compared against |
| 834 | */ |
| 835 | |
| 836 | void shift_boundaries(file_data f) { |
| 837 | final boolean[] changed = changed_flag; |
| 838 | final boolean[] other_changed = f.changed_flag; |
| 839 | int i = 0; |
| 840 | int j = 0; |
| 841 | int i_end = buffered_lines; |
| 842 | int preceding = -1; |
| 843 | int other_preceding = -1; |
| 844 | |
| 845 | for (;;) |
| 846 | { |
| 847 | int start, end, other_start; |
| 848 | |
| 849 | /* Scan forwards to find beginning of another run of changes. |
| 850 | Also keep track of the corresponding point in the other file. */ |
| 851 | |
| 852 | while (i < i_end && !changed[1+i]) |
| 853 | { |
| 854 | while (other_changed[1+j++]) { |
| 855 | /* Non-corresponding lines in the other file |
| 856 | will count as the preceding batch of changes. */ |
| 857 | other_preceding = j; |
| 858 | } |
| 859 | i++; |
| 860 | } |
| 861 | |
| 862 | if (i == i_end) { |
| 863 | break; |
| 864 | } |
| 865 | |
| 866 | start = i; |
| 867 | other_start = j; |
| 868 | |
| 869 | for (;;) |
| 870 | { |
| 871 | /* Now find the end of this run of changes. */ |
| 872 | |
| 873 | while (i < i_end && changed[1+i]) { |
| 874 | i++; |
| 875 | } |
| 876 | end = i; |
| 877 | |
| 878 | /* If the first changed line matches the following unchanged one, |
| 879 | and this run does not follow right after a previous run, |
| 880 | and there are no lines deleted from the other file here, |
| 881 | then classify the first changed line as unchanged |
| 882 | and the following line as changed in its place. */ |
| 883 | |
| 884 | /* You might ask, how could this run follow right after another? |
| 885 | Only because the previous run was shifted here. */ |
| 886 | |
| 887 | if (end != i_end |
| 888 | && equivs[start] == equivs[end] |
| 889 | && !other_changed[1+j] |
| 890 | && end != i_end |
| 891 | && !((preceding >= 0 && start == preceding) |
| 892 | || (other_preceding >= 0 |
| 893 | && other_start == other_preceding))) |
| 894 | { |
| 895 | changed[1+end++] = true; |
| 896 | changed[1+start++] = false; |
| 897 | ++i; |
| 898 | /* Since one line-that-matches is now before this run |
| 899 | instead of after, we must advance in the other file |
| 900 | to keep in synch. */ |
| 901 | ++j; |
| 902 | } else { |
| 903 | break; |
| 904 | } |
| 905 | } |
| 906 | |
| 907 | preceding = i; |
| 908 | other_preceding = j; |
| 909 | } |
| 910 | } |
| 911 | |
| 912 | /** Number of elements (lines) in this file. */ |
| 913 | final int buffered_lines; |
| 914 | |
| 915 | /** Vector, indexed by line number, containing an equivalence code for |
| 916 | each line. It is this vector that is actually compared with that |
| 917 | of another file to generate differences. */ |
| 918 | private final int[] equivs; |
| 919 | |
| 920 | /** Vector, like the previous one except that |
| 921 | the elements for discarded lines have been squeezed out. */ |
| 922 | final int[] undiscarded; |
| 923 | |
| 924 | /** Vector mapping virtual line numbers (not counting discarded lines) |
| 925 | to real ones (counting those lines). Both are origin-0. */ |
| 926 | final int[] realindexes; |
| 927 | |
| 928 | /** Total number of nondiscarded lines. */ |
| 929 | int nondiscarded_lines; |
| 930 | |
| 931 | /** Array, indexed by real origin-1 line number, |
| 932 | containing true for a line that is an insertion or a deletion. |
| 933 | The results of comparison are stored here. */ |
| 934 | boolean[] changed_flag; |
| 935 | |
| 936 | } |
| 937 | } |