1: /*	diff - differential file comparison
   2: *
   3: *	Uses an algorithm due to Harold Stone, which finds
   4: *	a pair of longest identical subsequences in the two
   5: *	files.
   6: *
   7: *	The major goal is to generate the match vector J.
   8: *	J[i] is the index of the line in file1 corresponding
   9: *	to line i file0. J[i] = 0 if there is no
  10: *	such line in file1.
  11: *
  12: *	Lines are hashed so as to work in core. All potential
  13: *	matches are located by sorting the lines of each file
  14: *	on the hash (called value_____). In particular, this
  15: *	collects the equivalence classes in file1 together.
  16: *	Subroutine equiv____  replaces the value of each line in
  17: *	file0 by the index of the first element of its
  18: *	matching equivalence in (the reordered) file1.
  19: *	To save space equiv_____ squeezes file1 into a single
  20: *	array member______ in which the equivalence classes
  21: *	are simply concatenated, except that their first
  22: *	members are flagged by changing sign.
  23: *
  24: *	Next the indices that point into member______ are unsorted_______   into
  25: *	array class_____ according to the original order of file0.
  26: *
  27: *	The cleverness lies in routine stone______. This marches
  28: *	through the lines of file0, developing a vector klist_____
  29: *	of "k-candidates". At step i a k-candidate is a matched
  30: *	pair of lines x,y (x in file0 y in file1) such that
  31: *	there is a common subsequence of lenght k
  32: *	between the first i lines of file0 and the first y
  33: *	lines of file1, but there is no such subsequence for
  34: *	any smaller y. x is the earliest possible mate to y
  35: *	that occurs in such a subsequence.
  36: *
  37: *	Whenever any of the members of the equivalence class of
  38: *	lines in file1 matable to a line in file0 has serial number
  39: *	less than the y of some k-candidate, that k-candidate
  40: *	with the smallest such y is replaced. The new
  41: *	k-candidate is chained (via pred____) to the current
  42: *	k-1 candidate so that the actual subsequence can
  43: *	be recovered. When a member has serial number greater
  44: *	that the y of all k-candidates, the klist is extended.
  45: *	At the end, the longest subsequence is pulled out
  46: *	and placed in the array J by unravel_______.
  47: *
  48: *	With J in hand, the matches there recorded are
  49: *	check_____ed against reality to assure that no spurious
  50: *	matches have crept in due to hashing. If they have,
  51: *	they are broken, and "jackpot " is recorded--a harmless
  52: *	matter except that a true match for a spuriously
  53: *	mated line may now be unnecessarily reported as a change.
  54: *
  55: *	Much of the complexity of the program comes simply
  56: *	from trying to minimize core utilization and
  57: *	maximize the range of doable problems by dynamically
  58: *	allocating what is needed and reusing what is not.
  59: *	The core requirements for problems larger than somewhat
  60: *	are (in words) 2*length(file0) + length(file1) +
  61: *	3*(number of k-candidates installed),  typically about
  62: *	6n words for files of length n. There is also space for buf1
  63: *	used which could, by moving data underfoot and reallocating
  64: *	buf1 together with buf2, be completely overlaid.
  65: */
  66: struct buf {
  67:     int fdes;
  68:     char data[516];
  69: } *buf1, *buf2;
  70: 
  71: struct cand {
  72:     int x;
  73:     int y;
  74:     struct cand *pred;
  75: } cand;
  76: struct line {
  77:     int serial;
  78:     int value;
  79: } *file[2], line;
  80: int len[2];
  81: int *class; /*will be overlaid on file[0]*/
  82: int *member;    /*will be overlaid on file[1]*/
  83: struct cand **klist;    /*will be overlaid on file[0] after class*/
  84: int *J;     /*will be overlaid on class*/
  85: int *ixold; /*will be overlaid on klist*/
  86: int *ixnew; /*will be overlaid on file[1]*/
  87: 
  88: char *area;
  89: char *top;
  90: alloc(n)
  91: {
  92:     register char *p;
  93:     p = area;
  94:     n = (n+1) & ~1;
  95:     area =+ n;
  96:     while(area > top) {
  97:         if(sbrk(1024) == -1) {
  98:             mesg("Out of space\n");
  99:             exit(1);
 100:         }
 101:         top =+ 1024;
 102:     }
 103:     return(p);
 104: }
 105: 
 106: mesg(s)
 107: char *s;
 108: {
 109:     while(*s)
 110:         write(2,s++,1);
 111: }
 112: 
 113: sort(a,n)   /*shellsort CACM #201*/
 114: struct line *a;
 115: {
 116:     struct line w;
 117:     register int j,m;
 118:     struct line *ai;
 119:     register struct line *aim;
 120:     int k;
 121:     for(j=1;j<=n;j=* 2)
 122:         m = 2*j - 1;
 123:     for(m=/2;m!=0;m=/2) {
 124:         k = n-m;
 125:         for(j=1;j<=k;j++) {
 126:             for(ai = &a[j]; ai > a; ai =- m) {
 127:                 aim = &ai[m];
 128:                 if(aim->value > ai[0].value ||
 129:                    aim->value == ai[0].value &&
 130:                    aim->serial > ai[0].serial)
 131:                     break;
 132:                 w.value = ai[0].value;
 133:                 ai[0].value = aim->value;
 134:                 aim->value = w.value;
 135:                 w.serial = ai[0].serial;
 136:                 ai[0].serial = aim->serial;
 137:                 aim->serial = w.serial;
 138:             }
 139:         }
 140:     }
 141: }
 142: 
 143: unsort(f, l, b)
 144: struct line *f;
 145: int *b;
 146: {
 147:     int *a;
 148:     int i;
 149:     a = alloc((l+1)*sizeof(a[0]));
 150:     for(i=1;i<=l;i++)
 151:         a[f[i].serial] = f[i].value;
 152:     for(i=1;i<=l;i++)
 153:         b[i] = a[i];
 154:     area = a;
 155: }
 156: 
 157: prepare(i, arg)
 158: char *arg;
 159: {
 160:     register char *temp;
 161:     temp = file[i] = area;
 162:     alloc(sizeof(line));
 163:     input(arg);
 164:     len[i] = (area - temp)/sizeof(line) - 1;
 165:     alloc(sizeof(line));
 166:     sort(file[i], len[i]);
 167: }
 168: 
 169: input(arg)
 170: {
 171:     register int h, i;
 172:     register struct line *p;
 173:     if(fopen(arg,buf1) == -1) {
 174:         mesg("Cannot open ");
 175:         mesg(arg);
 176:         mesg("\n");
 177:         exit(1);
 178:     }
 179:     for(i=0; h=readhash(buf1);) {
 180:         p = alloc(sizeof(line));
 181:         p->serial = ++i;
 182:         p->value = h;
 183:     }
 184:     close(buf1->fdes);
 185: }
 186: 
 187: equiv(a,n,b,m,c)
 188: struct line *a, *b;
 189: int *c;
 190: {
 191:     register int i, j;
 192:     i = j = 1;
 193:     while(i<=n && j<=m) {
 194:         if(a[i].value <b[j].value)
 195:             a[i++].value = 0;
 196:         else if(a[i].value == b[j].value)
 197:             a[i++].value = j;
 198:         else
 199:             j++;
 200:     }
 201:     while(i <= n)
 202:         a[i++].value = 0;
 203:     b[m+1].value = 0;
 204:     j = 0;
 205:     while(++j <= m) {
 206:         c[j] = -b[j].serial;
 207:         while(b[j+1].value == b[j].value) {
 208:             j++;
 209:             c[j] = b[j].serial;
 210:         }
 211:     }
 212:     c[j] = -1;
 213: }
 214: 
 215: main(argc, argv)
 216: char **argv;
 217: {
 218:     int k;
 219: 
 220:     if(argc>1 && *argv[1]=='-') {
 221:         argc--;
 222:         argv++;
 223:     }
 224:     if(argc!=3) {
 225:         mesg("Arg count\n");
 226:         exit(1);
 227:     }
 228: 
 229:     area = top = sbrk(0);
 230:     buf1 = alloc(sizeof(*buf1));
 231:     prepare(0, argv[1]);
 232:     prepare(1, argv[2]);
 233: 
 234:     member = file[1];
 235:     equiv(file[0], len[0], file[1], len[1], member);
 236: 
 237:     class = file[0];
 238:     unsort(file[0], len[0], class);
 239: 
 240:     klist = &class[len[0]+2];
 241:     area = &member[len[1]+2];
 242:     k = stone(class, len[0], member, klist);
 243:     J = class;
 244:     unravel(klist[k]);
 245: 
 246:     ixold = klist;
 247:     ixnew = file[1];
 248:     area = &ixnew[len[1]+2];
 249:     buf2 = alloc(sizeof(*buf2));
 250:     if(check(argv))
 251:         mesg("Jackpot\n");
 252:     output(argv);
 253: }
 254: 
 255: stone(a,n,b,c)
 256: int *a;
 257: int *b;
 258: struct cand **c;
 259: {
 260:     register int i, k,y;
 261:     int j, l;
 262:     int skip;
 263:     k = 0;
 264:     c[0] = 0;
 265:     for(i=1; i<=n; i++) {
 266:         j = a[i];
 267:         if(j==0)
 268:             continue;
 269:         skip = 0;
 270:         do {
 271:             y = b[j];
 272:             if(y<0) y = -y;
 273:             if(skip)
 274:                 continue;
 275:             l = search(c, k, y);
 276:             if(l > k) {
 277:                 c[k+1] = newcand(i,y,c[k]);
 278:                 skip = 1;
 279:                 k++;
 280:             }
 281:             else if(c[l]->y > y && c[l]->x < i)
 282:                 c[l] = newcand(i,y,c[l-1]);
 283:         } while(b[++j] > 0);
 284:     }
 285:     return(k);
 286: }
 287: 
 288: struct cand *
 289: newcand(x,y,pred)
 290: struct cand *pred;
 291: {
 292:     struct cand *p;
 293:     p = alloc(sizeof(cand));
 294:     p->x = x;
 295:     p->y = y;
 296:     p->pred = pred;
 297:     return(p);
 298: }
 299: 
 300: search(c, k, y)
 301: struct cand **c;
 302: {
 303:     register int i, j, l;
 304:     int t;
 305:     i = 0;
 306:     j = k+1;
 307:     while((l=(i+j)/2) > i) {
 308:         t = c[l]->y;
 309:         if(t > y)
 310:             j = l;
 311:         else if(t < y)
 312:             i = l;
 313:         else
 314:             return(l);
 315:     }
 316:     return(l+1);
 317: }
 318: 
 319: unravel(p)
 320: struct cand *p;
 321: {
 322:     int i;
 323:     for(i=0; i<=len[0]; i++)
 324:         J[i] = 0;
 325:     while(p) {
 326:         J[p->x] = p->y;
 327:         p = p->pred;
 328:     }
 329: }
 330: 
 331: /* check does double duty:
 332: 1.  ferret out any fortuitous correspondences due
 333: to counfounding by hashing (which result in "jackpot")
 334: 2.  collect random access indexes to the two files */
 335: 
 336: check(argv)
 337: char **argv;
 338: {
 339:     register int i, j;
 340:     int ctold, ctnew;
 341:     int jackpot;
 342:     char c,d;
 343:     fopen(argv[1],buf1);
 344:     fopen(argv[2],buf2);
 345:     j = 1;
 346:     ctold = ctnew = 0;
 347:     ixold[0] = ixnew[0] = 0;
 348:     jackpot = 0;
 349:     for(i=1;i<=len[0];i++) {
 350:         if(J[i]==0) {
 351:             while(getc(buf1)!='\n') ctold++;
 352:             ixold[i] = ++ctold;
 353:             continue;
 354:         }
 355:         while(j<J[i]) {
 356:             while(getc(buf2)!='\n') ctnew++;
 357:             ixnew[j] = ++ctnew;
 358:             j++;
 359:         }
 360:         while((c=getc(buf1))==(d=getc(buf2))) {
 361:             if(c=='\n') break;
 362:             ctold++;
 363:             ctnew++;
 364:         }
 365:         while(c!='\n') {
 366:             jackpot++;
 367:             J[i] = 0;
 368:             c = getc(buf1);
 369:             ctold++;
 370:         }
 371:         ixold[i] = ++ctold;
 372:         while(d!='\n') {
 373:             jackpot++;
 374:             J[i] = 0;
 375:             d = getc(buf2);
 376:             ctnew++;
 377:         }
 378:         ixnew[j] = ++ctnew;
 379:         j++;
 380:     }
 381:     for(;j<=len[1];j++) {
 382:         while(getc(buf2)!='\n') ctnew++;
 383:         ixnew[j] = ++ctnew;
 384:     }
 385:     close(buf1->fdes);
 386:     close(buf2->fdes);
 387:     return(jackpot);
 388: }
 389: 
 390: output(argv)
 391: char **argv;
 392: {
 393:     int dir;
 394:     int m;
 395:     int i0,i1,j0,j1;
 396:     extern fout;
 397:     dir = **argv=='-';
 398:     fout = dup(1);
 399:     buf1->fdes = open(argv[1],0);
 400:     buf2->fdes = open(argv[2],0);
 401:     m = len[0];
 402:     J[0] = 0;
 403:     J[m+1] = len[1]+1;
 404:     if(dir==0) for(i0=1;i0<=m;i0=i1+1) {
 405:         while(i0<=m&&J[i0]==J[i0-1]+1) i0++;
 406:         j0 = J[i0-1]+1;
 407:         i1 = i0-1;
 408:         while(i1<m&&J[i1+1]==0) i1++;
 409:         j1 = J[i1+1]-1;
 410:         J[i1] = j1;
 411:         change(i0,i1,j0,j1,dir);
 412:     } else for(i0=m;i0>=1;i0=i1-1) {
 413:         while(i0>=1&&J[i0]==J[i0+1]-1&&J[i0]!=0) i0--;
 414:         j0 = J[i0+1]-1;
 415:         i1 = i0+1;
 416:         while(i1>1&&J[i1-1]==0) i1--;
 417:         j1 = J[i1-1]+1;
 418:         J[i1] = j1;
 419:         change(i1,i0,j1,j0,dir);
 420:     }
 421:     if(m==0)
 422:         change(1,0,1,len[1],dir);
 423:     flush();
 424: }
 425: 
 426: change(a,b,c,d,dir)
 427: {
 428:     if(a>b&&c>d) return;
 429:     range(a,b);
 430:     putchar(a>b?'a':c>d?'d':'c');
 431:     if(dir==0) range(c,d);
 432:     putchar('\n');
 433:     if(dir==0) {
 434:         fetch(ixold,a,b,buf1,"* ");
 435:         if(a<=b&&c<=d) printf("---\n");
 436:     }
 437:     fetch(ixnew,c,d,buf2,dir==0?". ":"");
 438:     if(dir!=0&&c<=d) printf(".\n");
 439: }
 440: 
 441: range(a,b)
 442: {
 443:     if(a>b) printf("%d",b);
 444:     if(a<=b) printf("%d",a);
 445:     if(a<b) printf(",%d",b);
 446: }
 447: 
 448: fetch(f,a,b,lb,pref)
 449: int *f;
 450: struct buf *lb;
 451: char *pref;
 452: {
 453:     int i, j;
 454:     int nc;
 455:     for(i=a;i<=b;i++) {
 456:         seek(lb->fdes,f[i-1],0);
 457:         nc = read(lb->fdes,lb->data,f[i]-f[i-1]);
 458:         printf(pref);
 459:         for(j=0;j<nc;j++)
 460:             putchar(lb->data[j]);
 461:     }
 462: }

Defined functions

alloc defined in line 90; used 7 times
change defined in line 426; used 3 times
check defined in line 336; used 1 times
equiv defined in line 187; used 1 times
fetch defined in line 448; used 2 times
input defined in line 169; used 1 times
main defined in line 215; never used
mesg defined in line 106; used 6 times
newcand defined in line 288; used 2 times
output defined in line 390; used 1 times
prepare defined in line 157; used 2 times
range defined in line 441; used 2 times
search defined in line 300; used 1 times
sort defined in line 113; used 1 times
stone defined in line 255; used 1 times
unravel defined in line 319; used 1 times
unsort defined in line 143; used 1 times

Defined variables

J defined in line 84; used 22 times
area defined in line 88; used 9 times
buf1 defined in line 69; used 12 times
buf2 defined in line 69; used 10 times
cand defined in line 75; used 1 times
class defined in line 81; used 5 times
file defined in line 79; used 8 times
ixnew defined in line 86; used 7 times
ixold defined in line 85; used 5 times
klist defined in line 83; used 4 times
len defined in line 80; used 15 times
line defined in line 79; used 4 times
member defined in line 82; used 4 times
top defined in line 89; used 3 times

Defined struct's

buf defined in line 66; used 2 times
  • in line 450(2)
cand defined in line 71; used 16 times
line defined in line 76; used 14 times
Last modified: 1975-05-14
Generated: 2016-12-26
Generated by src2html V0.67
page hit count: 1450
Valid CSS Valid XHTML 1.0 Strict