Context Navigation

source: josm/trunk/src/org/openstreetmap/josm/tools/Diff.java@ 9519

Last change on this file since 9519 was 9231, checked in by Don-vip, 8 years ago
javadoc update
Property svn:eol-style set to `native`
File size: 36.7 KB

Line
1	// License: GPL. For details, see LICENSE file.
2	package org.openstreetmap.josm.tools;
3
4	//// Taken from http://www.bmsi.com/java/#diff
5
6	// http://www.bmsi.com/java/DiffPrint.java could also be useful
7
8	/*
9	* $Log: Diff.java,v $
10	* Revision 1.15 2013/04/01 16:27:31 stuart
11	* Fix DiffPrint unified format with test cases.
12	* Begin porting some diff-2.7 features.
13	*
14	* Revision 1.14 2010/03/03 21:21:25 stuart
15	* Test new direct equivalence API
16	*
17	* Revision 1.13 2009/12/07 17:43:17 stuart
18	* Compute equivMax for int[] ctor
19	*
20	* Revision 1.12 2009/12/07 17:34:46 stuart
21	* Ctor with int[].
22	*
23	* Revision 1.11 2009/11/15 01:11:54 stuart
24	* Diff doesn't need to be generic
25	*
26	* Revision 1.10 2009/11/15 00:54:03 stuart
27	* Update to Java 5 containers
28	*
29	* Revision 1.7 2009/01/19 03:05:26 stuart
30	* Fix StackOverflow bug with heuristic on reported by Jimmy Han.
31	*
32	* Revision 1.6 2003/03/06 22:51:32 stuart
33	* Convert to CVS
34	*
35	* Revision 1.5 2002/07/19 19:14:40 stuart
36	* fix reverseScript, make change ctor public, update docs
37	*
38	* Revision 1.4 2002/04/09 17:53:39 stuart
39	* More flexible interface for diff() function.
40	*
41	* Revision 1.3 2000/03/03 21:58:03 stuart
42	* move discard_confusing_lines and shift_boundaries to class file_data
43	*
44	* Revision 1.2 2000/03/02 16:37:38 stuart
45	* Add GPL and copyright
46	*
47	*/
48
49	import java.util.HashMap;
50	import java.util.Map;
51
52	/** A class to compare vectors of objects. The result of comparison
53	is a list of <code>change</code> objects which form an
54	edit script. The objects compared are traditionally lines
55	of text from two files. Comparison options such as "ignore
56	whitespace" are implemented by modifying the <code>equals</code>
57	and <code>hashcode</code> methods for the objects compared.
58	<p>
59	The basic algorithm is described in: <br>
60	"An O(ND) Difference Algorithm and its Variations", Eugene Myers,
61	Algorithmica Vol. 1 No. 2, 1986, p 251.
62	<p>
63	This class outputs different results from GNU diff 1.15 on some
64	inputs. Our results are actually better (smaller change list, smaller
65	total size of changes), but it would be nice to know why. Perhaps
66	there is a memory overwrite bug in GNU diff 1.15.
67
68	@author Stuart D. Gathman, translated from GNU diff 1.15
69	Copyright (C) 2000 Business Management Systems, Inc.
70	<p>
71	This program is free software; you can redistribute it and/or modify
72	it under the terms of the GNU General Public License as published by
73	the Free Software Foundation; either version 1, or (at your option)
74	any later version.
75	<p>
76	This program is distributed in the hope that it will be useful,
77	but WITHOUT ANY WARRANTY; without even the implied warranty of
78	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
79	GNU General Public License for more details.
80	<p>
81	You should have received a copy of the <a href=COPYING.txt>
82	GNU General Public License</a>
83	along with this program; if not, write to the Free Software
84	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
85	*/
86	public class Diff {
87
88	/** Prepare to find differences between two arrays. Each element of
89	the arrays is translated to an "equivalence number" based on
90	the result of <code>equals</code>. The original Object arrays
91	are no longer needed for computing the differences. They will
92	be needed again later to print the results of the comparison as
93	an edit script, if desired.
94	* @param a first array
95	* @param b second array
96	*/
97	public Diff(Object[] a, Object[] b) {
98	Map<Object, Integer> h = new HashMap<>(a.length + b.length);
99	filevec = new FileData[] {new FileData(a, h), new FileData(b, h)};
100	}
101
102	/** 1 more than the maximum equivalence value used for this or its
103	sibling file. */
104	private int equivMax = 1;
105
106	/** When set to true, the comparison uses a heuristic to speed it up.
107	With this heuristic, for files with a constant small density
108	of changes, the algorithm is linear in the file size. */
109	public boolean heuristic;
110
111	/** When set to true, the algorithm returns a guarranteed minimal
112	set of changes. This makes things slower, sometimes much slower. */
113	public boolean noDiscards;
114
115	private int[] xvec, yvec; /* Vectors being compared. */
116	private int[] fdiag; /* Vector, indexed by diagonal, containing
117	the X coordinate of the point furthest
118	along the given diagonal in the forward
119	search of the edit matrix. */
120	private int[] bdiag; /* Vector, indexed by diagonal, containing
121	the X coordinate of the point furthest
122	along the given diagonal in the backward
123	search of the edit matrix. */
124	private int fdiagoff, bdiagoff;
125	private final FileData[] filevec;
126	private int cost;
127	/** Snakes bigger than this are considered "big". */
128	private static final int SNAKE_LIMIT = 20;
129
130	/**
131	* Find the midpoint of the shortest edit script for a specified
132	* portion of the two files.
133	*
134	* We scan from the beginnings of the files, and simultaneously from the ends,
135	* doing a breadth-first search through the space of edit-sequence.
136	* When the two searches meet, we have found the midpoint of the shortest
137	* edit sequence.
138	*
139	* The value returned is the number of the diagonal on which the midpoint lies.
140	* The diagonal number equals the number of inserted lines minus the number
141	* of deleted lines (counting only lines before the midpoint).
142	* The edit cost is stored into COST; this is the total number of
143	* lines inserted or deleted (counting only lines before the midpoint).
144	*
145	* This function assumes that the first lines of the specified portions
146	* of the two files do not match, and likewise that the last lines do not
147	* match. The caller must trim matching lines from the beginning and end
148	* of the portions it is going to specify.
149	*
150	* Note that if we return the "wrong" diagonal value, or if
151	* the value of bdiag at that diagonal is "wrong",
152	* the worst this can do is cause suboptimal diff output.
153	* It cannot cause incorrect diff output.
154	* @param xoff xoff
155	* @param xlim xlim
156	* @param yoff yoff
157	* @param ylim ylim
158	* @return midpoint of the shortest edit script
159	*/
160	private int diag(int xoff, int xlim, int yoff, int ylim) {
161	final int[] fd = fdiag; // Give the compiler a chance.
162	final int[] bd = bdiag; // Additional help for the compiler.
163	final int[] xv = xvec; // Still more help for the compiler.
164	final int[] yv = yvec; // And more and more . . .
165	final int dmin = xoff - ylim; // Minimum valid diagonal.
166	final int dmax = xlim - yoff; // Maximum valid diagonal.
167	final int fmid = xoff - yoff; // Center diagonal of top-down search.
168	final int bmid = xlim - ylim; // Center diagonal of bottom-up search.
169	int fmin = fmid, fmax = fmid; // Limits of top-down search.
170	int bmin = bmid, bmax = bmid; // Limits of bottom-up search.
171	// True if southeast corner is on an odd diagonal with respect to the northwest.
172	final boolean odd = (fmid - bmid & 1) != 0;
173
174	fd[fdiagoff + fmid] = xoff;
175	bd[bdiagoff + bmid] = xlim;
176
177	for (int c = 1;; ++c) {
178	int d; /* Active diagonal. */
179	boolean big_snake = false;
180
181	/* Extend the top-down search by an edit step in each diagonal. */
182	if (fmin > dmin) {
183	fd[fdiagoff + --fmin - 1] = -1;
184	} else {
185	++fmin;
186	}
187	if (fmax < dmax) {
188	fd[fdiagoff + ++fmax + 1] = -1;
189	} else {
190	--fmax;
191	}
192	for (d = fmax; d >= fmin; d -= 2) {
193	int x, y, oldx, tlo = fd[fdiagoff + d - 1], thi = fd[fdiagoff + d + 1];
194
195	if (tlo >= thi) {
196	x = tlo + 1;
197	} else {
198	x = thi;
199	}
200	oldx = x;
201	y = x - d;
202	while (x < xlim && y < ylim && xv[x] == yv[y]) {
203	++x; ++y;
204	}
205	if (x - oldx > SNAKE_LIMIT) {
206	big_snake = true;
207	}
208	fd[fdiagoff + d] = x;
209	if (odd && bmin <= d && d <= bmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) {
210	cost = 2 * c - 1;
211	return d;
212	}
213	}
214
215	/* Similar extend the bottom-up search. */
216	if (bmin > dmin) {
217	bd[bdiagoff + --bmin - 1] = Integer.MAX_VALUE;
218	} else {
219	++bmin;
220	}
221	if (bmax < dmax) {
222	bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE;
223	} else {
224	--bmax;
225	}
226	for (d = bmax; d >= bmin; d -= 2) {
227	int x, y, oldx, tlo = bd[bdiagoff + d - 1], thi = bd[bdiagoff + d + 1];
228
229	if (tlo < thi) {
230	x = tlo;
231	} else {
232	x = thi - 1;
233	}
234	oldx = x;
235	y = x - d;
236	while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) {
237	--x; --y;
238	}
239	if (oldx - x > SNAKE_LIMIT) {
240	big_snake = true;
241	}
242	bd[bdiagoff + d] = x;
243	if (!odd && fmin <= d && d <= fmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) {
244	cost = 2 * c;
245	return d;
246	}
247	}
248
249	/* Heuristic: check occasionally for a diagonal that has made
250	lots of progress compared with the edit distance.
251	If we have any such, find the one that has made the most
252	progress and return it as if it had succeeded.
253
254	With this heuristic, for files with a constant small density
255	of changes, the algorithm is linear in the file size. */
256
257	if (c > 200 && big_snake && heuristic) {
258	int best = 0;
259	int bestpos = -1;
260
261	for (d = fmax; d >= fmin; d -= 2) {
262	int dd = d - fmid;
263	int x = fd[fdiagoff + d];
264	int y = x - d;
265	int v = (x - xoff) * 2 - dd;
266	if (v > 12 * (c + (dd < 0 ? -dd : dd))) {
267	if (v > best
268	&& xoff + SNAKE_LIMIT <= x && x < xlim
269	&& yoff + SNAKE_LIMIT <= y && y < ylim) {
270	/* We have a good enough best diagonal.
271	now insist that it end with a significant snake. */
272	int k;
273
274	for (k = 1; xvec[x - k] == yvec[y - k]; k++) {
275	if (k == SNAKE_LIMIT) {
276	best = v;
277	bestpos = d;
278	break;
279	}
280	}
281	}
282	}
283	}
284	if (best > 0) {
285	cost = 2 * c - 1;
286	return bestpos;
287	}
288
289	best = 0;
290	for (d = bmax; d >= bmin; d -= 2) {
291	int dd = d - bmid;
292	int x = bd[bdiagoff + d];
293	int y = x - d;
294	int v = (xlim - x) * 2 + dd;
295	if (v > 12 * (c + (dd < 0 ? -dd : dd))) {
296	if (v > best
297	&& xoff < x && x <= xlim - SNAKE_LIMIT
298	&& yoff < y && y <= ylim - SNAKE_LIMIT) {
299	/* We have a good enough best diagonal.
300	now insist that it end with a significant snake. */
301	int k;
302
303	for (k = 0; xvec[x + k] == yvec[y + k]; k++) {
304	if (k == SNAKE_LIMIT) {
305	best = v;
306	bestpos = d;
307	break;
308	}
309	}
310	}
311	}
312	}
313	if (best > 0) {
314	cost = 2 * c - 1;
315	return bestpos;
316	}
317	}
318	}
319	}
320
321	/**
322	* Compare in detail contiguous subsequences of the two files
323	* which are known, as a whole, to match each other.
324	*
325	* The results are recorded in the vectors filevec[N].changed_flag, by
326	* storing a 1 in the element for each line that is an insertion or deletion.
327	*
328	* The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
329	*
330	* Note that XLIM, YLIM are exclusive bounds.
331	* All line numbers are origin-0 and discarded lines are not counted.
332	* @param xoff xoff
333	* @param xlim xlim
334	* @param yoff yoff
335	* @param ylim ylim
336	*/
337	private void compareseq(int xoff, int xlim, int yoff, int ylim) {
338	/* Slide down the bottom initial diagonal. */
339	while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) {
340	++xoff; ++yoff;
341	}
342	/* Slide up the top initial diagonal. */
343	while (xlim > xoff && ylim > yoff && xvec[xlim - 1] == yvec[ylim - 1]) {
344	--xlim; --ylim;
345	}
346
347	/* Handle simple cases. */
348	if (xoff == xlim) {
349	while (yoff < ylim) {
350	filevec[1].changedFlag[1+filevec[1].realindexes[yoff++]] = true;
351	}
352	} else if (yoff == ylim) {
353	while (xoff < xlim) {
354	filevec[0].changedFlag[1+filevec[0].realindexes[xoff++]] = true;
355	}
356	} else {
357	/* Find a point of correspondence in the middle of the files. */
358
359	int d = diag(xoff, xlim, yoff, ylim);
360	int c = cost;
361	int b = bdiag[bdiagoff + d];
362
363	if (c == 1)
364	/* This should be impossible, because it implies that
365	one of the two subsequences is empty,
366	and that case was handled above without calling `diag'.
367	Let's verify that this is true. */
368	throw new IllegalArgumentException("Empty subsequence");
369	else {
370	/* Use that point to split this problem into two subproblems. */
371	compareseq(xoff, b, yoff, b - d);
372	/* This used to use f instead of b,
373	but that is incorrect!
374	It is not necessarily the case that diagonal d
375	has a snake from b to f. */
376	compareseq(b, xlim, b - d, ylim);
377	}
378	}
379	}
380
381	/** Discard lines from one file that have no matches in the other file.
382	*/
383	private void discard_confusing_lines() {
384	filevec[0].discard_confusing_lines(filevec[1]);
385	filevec[1].discard_confusing_lines(filevec[0]);
386	}
387
388	private boolean inhibit;
389
390	/**
391	* Adjust inserts/deletes of blank lines to join changes as much as possible.
392	*/
393	private void shift_boundaries() {
394	if (inhibit)
395	return;
396	filevec[0].shift_boundaries(filevec[1]);
397	filevec[1].shift_boundaries(filevec[0]);
398	}
399
400	/**
401	* Script builder.
402	*/
403	public interface ScriptBuilder {
404	/**
405	* Scan the tables of which lines are inserted and deleted, producing an edit script.
406	* @param changed0 true for lines in first file which do not match 2nd
407	* @param len0 number of lines in first file
408	* @param changed1 true for lines in 2nd file which do not match 1st
409	* @param len1 number of lines in 2nd file
410	* @return a linked list of changes - or null
411	*/
412	Change build_script(
413	boolean[] changed0, int len0,
414	boolean[] changed1, int len1
415	);
416	}
417
418	/** Scan the tables of which lines are inserted and deleted,
419	producing an edit script in reverse order. */
420
421	static class ReverseScript implements ScriptBuilder {
422	@Override
423	public Change build_script(
424	final boolean[] changed0, int len0,
425	final boolean[] changed1, int len1) {
426	Change script = null;
427	int i0 = 0, i1 = 0;
428	while (i0 < len0 \|\| i1 < len1) {
429	if (changed0[1+i0] \|\| changed1[1+i1]) {
430	int line0 = i0, line1 = i1;
431
432	/* Find # lines changed here in each file. */
433	while (changed0[1+i0]) {
434	++i0;
435	}
436	while (changed1[1+i1]) {
437	++i1;
438	}
439
440	/* Record this change. */
441	script = new Change(line0, line1, i0 - line0, i1 - line1, script);
442	}
443
444	/* We have reached lines in the two files that match each other. */
445	i0++; i1++;
446	}
447
448	return script;
449	}
450	}
451
452	static class ForwardScript implements ScriptBuilder {
453	/** Scan the tables of which lines are inserted and deleted,
454	producing an edit script in forward order. */
455	@Override
456	public Change build_script(
457	final boolean[] changed0, int len0,
458	final boolean[] changed1, int len1) {
459	Change script = null;
460	int i0 = len0, i1 = len1;
461
462	while (i0 >= 0 \|\| i1 >= 0) {
463	if (changed0[i0] \|\| changed1[i1]) {
464	int line0 = i0, line1 = i1;
465
466	/* Find # lines changed here in each file. */
467	while (changed0[i0]) {
468	--i0;
469	}
470	while (changed1[i1]) {
471	--i1;
472	}
473
474	/* Record this change. */
475	script = new Change(i0, i1, line0 - i0, line1 - i1, script);
476	}
477
478	/* We have reached lines in the two files that match each other. */
479	i0--; i1--;
480	}
481
482	return script;
483	}
484	}
485
486	/** Standard Forward ScriptBuilder. */
487	public static final ScriptBuilder forwardScript = new ForwardScript();
488	/** Standard Reverse ScriptBuilder. */
489	public static final ScriptBuilder reverseScript = new ReverseScript();
490
491	/**
492	* Report the differences of two files. DEPTH is the current directory depth.
493	* @param reverse if {@code true} use {@link #reverseScript} else use {@link #forwardScript}
494	* @return the differences of two files
495	*/
496	public final Change diff_2(final boolean reverse) {
497	return diff(reverse ? reverseScript : forwardScript);
498	}
499
500	/**
501	* Get the results of comparison as an edit script. The script
502	* is described by a list of changes. The standard ScriptBuilder
503	* implementations provide for forward and reverse edit scripts.
504	* Alternate implementations could, for instance, list common elements
505	* instead of differences.
506	* @param bld an object to build the script from change flags
507	* @return the head of a list of changes
508	*/
509	public Change diff(final ScriptBuilder bld) {
510
511	// Some lines are obviously insertions or deletions because they don't match anything.
512	// Detect them now, and avoid even thinking about them in the main comparison algorithm.
513	discard_confusing_lines();
514
515	// Now do the main comparison algorithm, considering just the undiscarded lines.
516	xvec = filevec[0].undiscarded;
517	yvec = filevec[1].undiscarded;
518
519	int diags = filevec[0].nondiscardedLines + filevec[1].nondiscardedLines + 3;
520	fdiag = new int[diags];
521	fdiagoff = filevec[1].nondiscardedLines + 1;
522	bdiag = new int[diags];
523	bdiagoff = filevec[1].nondiscardedLines + 1;
524
525	compareseq(0, filevec[0].nondiscardedLines,
526	0, filevec[1].nondiscardedLines);
527	fdiag = null;
528	bdiag = null;
529
530	// Modify the results slightly to make them prettier in cases where that can validly be done.
531	shift_boundaries();
532
533	// Get the results of comparison in the form of a chain of `struct change's -- an edit script.
534	return bld.build_script(
535	filevec[0].changedFlag,
536	filevec[0].bufferedLines,
537	filevec[1].changedFlag,
538	filevec[1].bufferedLines
539	);
540	}
541
542	/** The result of comparison is an "edit script": a chain of change objects.
543	Each change represents one place where some lines are deleted
544	and some are inserted.
545
546	LINE0 and LINE1 are the first affected lines in the two files (origin 0).
547	DELETED is the number of lines deleted here from file 0.
548	INSERTED is the number of lines inserted here in file 1.
549
550	If DELETED is 0 then LINE0 is the number of the line before
551	which the insertion was done; vice versa for INSERTED and LINE1. */
552
553	public static class Change {
554	/** Previous or next edit command. */
555	public Change link;
556	/** # lines of file 1 changed here. */
557	public final int inserted;
558	/** # lines of file 0 changed here. */
559	public final int deleted;
560	/** Line number of 1st deleted line. */
561	public final int line0;
562	/** Line number of 1st inserted line. */
563	public final int line1;
564
565	/**
566	* Cons an additional entry onto the front of an edit script OLD.
567	* LINE0 and LINE1 are the first affected lines in the two files (origin 0).
568	* DELETED is the number of lines deleted here from file 0.
569	* INSERTED is the number of lines inserted here in file 1.
570	*
571	* If DELETED is 0 then LINE0 is the number of the line before
572	* which the insertion was done; vice versa for INSERTED and LINE1.
573	* @param line0 first affected lines in the two files (origin 0)
574	* @param line1 first affected lines in the two files (origin 0)
575	* @param deleted the number of lines deleted here from file 0
576	* @param inserted the number of lines inserted here in file 1
577	* @param old edit script
578	*/
579	public Change(int line0, int line1, int deleted, int inserted, Change old) {
580	this.line0 = line0;
581	this.line1 = line1;
582	this.inserted = inserted;
583	this.deleted = deleted;
584	this.link = old;
585	}
586
587	/**
588	* Returns the number of insertions and deletions of this change as well as
589	* (recursively) the changes linked via {@link #link}.
590	* @return recursive number of insertions and deletions
591	*/
592	public int getTotalNumberOfChanges() {
593	return inserted + deleted + (link != null ? link.getTotalNumberOfChanges() : 0);
594	}
595
596	@Override
597	public String toString() {
598	String s = String.format("%d -%d +%d %d", line0, deleted, inserted, line1);
599	return (link != null) ? s = s + '\n' + link : s;
600	}
601	}
602
603	/**
604	* Data on one input file being compared.
605	*/
606	class FileData {
607
608	/** Allocate changed array for the results of comparison. */
609	void clear() {
610	// Allocate a flag for each line of each file, saying whether that line is an insertion or deletion.
611	// Allocate an extra element, always zero, at each end of each vector.
612	changedFlag = new boolean[bufferedLines + 2];
613	}
614
615	/**
616	* Return equiv_count[I] as the number of lines in this file that fall in equivalence class I.
617	* @return the array of equivalence class counts.
618	*/
619	int[] equivCount() {
620	int[] equiv_count = new int[equivMax];
621	for (int i = 0; i < bufferedLines; ++i) {
622	++equiv_count[equivs[i]];
623	}
624	return equiv_count;
625	}
626
627	/**
628	* Discard lines that have no matches in another file.
629	*
630	* A line which is discarded will not be considered by the actual comparison algorithm;
631	* it will be as if that line were not in the file.
632	* The file's `realindexes' table maps virtual line numbers
633	* (which don't count the discarded lines) into real line numbers;
634	* this is how the actual comparison algorithm produces results
635	* that are comprehensible when the discarded lines are counted.
636	* <p>
637	* When we discard a line, we also mark it as a deletion or insertion so that it will be printed in the output.
638	* @param f the other file
639	*/
640	void discard_confusing_lines(FileData f) {
641	clear();
642	// Set up table of which lines are going to be discarded.
643	final byte[] discarded = discardable(f.equivCount());
644
645	// Don't really discard the provisional lines except when they occur in a run of discardables,
646	// with nonprovisionals at the beginning and end.
647	filterDiscards(discarded);
648
649	// Actually discard the lines.
650	discard(discarded);
651	}
652
653	/**
654	* Mark to be discarded each line that matches no line of another file.
655	* If a line matches many lines, mark it as provisionally discardable.
656	* @param counts The count of each equivalence number for the other file.
657	* @return 0=nondiscardable, 1=discardable or 2=provisionally discardable for each line
658	* @see #equivCount()
659	*/
660	private byte[] discardable(final int[] counts) {
661	final int end = bufferedLines;
662	final byte[] discards = new byte[end];
663	final int[] equivs = this.equivs;
664	int many = 5;
665	int tem = end / 64;
666
667	/* Multiply MANY by approximate square root of number of lines.
668	That is the threshold for provisionally discardable lines. */
669	while ((tem = tem >> 2) > 0) {
670	many *= 2;
671	}
672
673	for (int i = 0; i < end; i++) {
674	int nmatch;
675	if (equivs[i] == 0) {
676	continue;
677	}
678	nmatch = counts[equivs[i]];
679	if (nmatch == 0) {
680	discards[i] = 1;
681	} else if (nmatch > many) {
682	discards[i] = 2;
683	}
684	}
685	return discards;
686	}
687
688	/**
689	* Don't really discard the provisional lines except when they occur
690	* in a run of discardables, with nonprovisionals at the beginning and end.
691	* @param discards discards
692	*/
693	private void filterDiscards(final byte[] discards) {
694	final int end = bufferedLines;
695
696	for (int i = 0; i < end; i++) {
697	/* Cancel provisional discards not in middle of run of discards. */
698	if (discards[i] == 2) {
699	discards[i] = 0;
700	} else if (discards[i] != 0) {
701	/* We have found a nonprovisional discard. */
702	int j;
703	int length;
704	int provisional = 0;
705
706	/* Find end of this run of discardable lines.
707	Count how many are provisionally discardable. */
708	for (j = i; j < end; j++) {
709	if (discards[j] == 0) {
710	break;
711	}
712	if (discards[j] == 2) {
713	++provisional;
714	}
715	}
716
717	/* Cancel provisional discards at end, and shrink the run. */
718	while (j > i && discards[j - 1] == 2) {
719	discards[--j] = 0; --provisional;
720	}
721
722	/* Now we have the length of a run of discardable lines
723	whose first and last are not provisional. */
724	length = j - i;
725
726	/* If 1/4 of the lines in the run are provisional,
727	cancel discarding of all provisional lines in the run. */
728	if (provisional * 4 > length) {
729	while (j > i)
730	if (discards[--j] == 2) {
731	discards[j] = 0;
732	}
733	} else {
734	int consec;
735	int minimum = 1;
736	int tem = length / 4;
737
738	/* MINIMUM is approximate square root of LENGTH/4.
739	A subrun of two or more provisionals can stand
740	when LENGTH is at least 16.
741	A subrun of 4 or more can stand when LENGTH >= 64. */
742	while ((tem = tem >> 2) > 0) {
743	minimum *= 2;
744	}
745	minimum++;
746
747	/* Cancel any subrun of MINIMUM or more provisionals
748	within the larger run. */
749	for (j = 0, consec = 0; j < length; j++) {
750	if (discards[i + j] != 2) {
751	consec = 0;
752	} else if (minimum == ++consec) {
753	/* Back up to start of subrun, to cancel it all. */
754	j -= consec;
755	} else if (minimum < consec) {
756	discards[i + j] = 0;
757	}
758	}
759
760	/* Scan from beginning of run
761	until we find 3 or more nonprovisionals in a row
762	or until the first nonprovisional at least 8 lines in.
763	Until that point, cancel any provisionals. */
764	for (j = 0, consec = 0; j < length; j++) {
765	if (j >= 8 && discards[i + j] == 1) {
766	break;
767	}
768	if (discards[i + j] == 2) {
769	consec = 0; discards[i + j] = 0;
770	} else if (discards[i + j] == 0) {
771	consec = 0;
772	} else {
773	consec++;
774	}
775	if (consec == 3) {
776	break;
777	}
778	}
779
780	/* I advances to the last line of the run. */
781	i += length - 1;
782
783	/* Same thing, from end. */
784	for (j = 0, consec = 0; j < length; j++) {
785	if (j >= 8 && discards[i - j] == 1) {
786	break;
787	}
788	if (discards[i - j] == 2) {
789	consec = 0; discards[i - j] = 0;
790	} else if (discards[i - j] == 0) {
791	consec = 0;
792	} else {
793	consec++;
794	}
795	if (consec == 3) {
796	break;
797	}
798	}
799	}
800	}
801	}
802	}
803
804	/** Actually discard the lines.
805	@param discards flags lines to be discarded
806	*/
807	private void discard(final byte[] discards) {
808	final int end = bufferedLines;
809	int j = 0;
810	for (int i = 0; i < end; ++i) {
811	if (noDiscards \|\| discards[i] == 0) {
812	undiscarded[j] = equivs[i];
813	realindexes[j++] = i;
814	} else {
815	changedFlag[1+i] = true;
816	}
817	}
818	nondiscardedLines = j;
819	}
820
821	FileData(int length) {
822	bufferedLines = length;
823	equivs = new int[length];
824	undiscarded = new int[bufferedLines];
825	realindexes = new int[bufferedLines];
826	}
827
828	FileData(Object[] data, Map<Object, Integer> h) {
829	this(data.length);
830	// FIXME: diff 2.7 removes common prefix and common suffix
831	for (int i = 0; i < data.length; ++i) {
832	Integer ir = h.get(data[i]);
833	if (ir == null) {
834	h.put(data[i], equivs[i] = equivMax++);
835	} else {
836	equivs[i] = ir.intValue();
837	}
838	}
839	}
840
841	/** Adjust inserts/deletes of blank lines to join changes
842	as much as possible.
843
844	We do something when a run of changed lines include a blank
845	line at one end and have an excluded blank line at the other.
846	We are free to choose which blank line is included.
847	`compareseq' always chooses the one at the beginning,
848	but usually it is cleaner to consider the following blank line
849	to be the "change". The only exception is if the preceding blank line
850	would join this change to other changes.
851	@param f the file being compared against
852	*/
853	void shift_boundaries(FileData f) {
854	final boolean[] changed = changedFlag;
855	final boolean[] other_changed = f.changedFlag;
856	int i = 0;
857	int j = 0;
858	int i_end = bufferedLines;
859	int preceding = -1;
860	int other_preceding = -1;
861
862	for (;;) {
863	int start, end, other_start;
864
865	/* Scan forwards to find beginning of another run of changes.
866	Also keep track of the corresponding point in the other file. */
867
868	while (i < i_end && !changed[1+i]) {
869	while (other_changed[1+j++]) {
870	/* Non-corresponding lines in the other file
871	will count as the preceding batch of changes. */
872	other_preceding = j;
873	}
874	i++;
875	}
876
877	if (i == i_end) {
878	break;
879	}
880
881	start = i;
882	other_start = j;
883
884	for (;;) {
885	/* Now find the end of this run of changes. */
886
887	while (i < i_end && changed[1+i]) {
888	i++;
889	}
890	end = i;
891
892	/* If the first changed line matches the following unchanged one,
893	and this run does not follow right after a previous run,
894	and there are no lines deleted from the other file here,
895	then classify the first changed line as unchanged
896	and the following line as changed in its place. */
897
898	/* You might ask, how could this run follow right after another?
899	Only because the previous run was shifted here. */
900
901	if (end != i_end && equivs[start] == equivs[end] && !other_changed[1+j]
902	&& !((preceding >= 0 && start == preceding) \|\| (other_preceding >= 0 && other_start == other_preceding))) {
903	changed[1+end++] = true;
904	changed[1+start++] = false;
905	++i;
906	/* Since one line-that-matches is now before this run
907	instead of after, we must advance in the other file
908	to keep in synch. */
909	++j;
910	} else {
911	break;
912	}
913	}
914
915	preceding = i;
916	other_preceding = j;
917	}
918	}
919
920	/** Number of elements (lines) in this file. */
921	private final int bufferedLines;
922
923	/** Vector, indexed by line number, containing an equivalence code for
924	each line. It is this vector that is actually compared with that
925	of another file to generate differences. */
926	private final int[] equivs;
927
928	/** Vector, like the previous one except that
929	the elements for discarded lines have been squeezed out. */
930	private final int[] undiscarded;
931
932	/** Vector mapping virtual line numbers (not counting discarded lines)
933	to real ones (counting those lines). Both are origin-0. */
934	private final int[] realindexes;
935
936	/** Total number of nondiscarded lines. */
937	private int nondiscardedLines;
938
939	/** Array, indexed by real origin-1 line number,
940	containing true for a line that is an insertion or a deletion.
941	The results of comparison are stored here. */
942	private boolean[] changedFlag;
943	}
944	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: