1: #
   2: /*	diff - differential file comparison
   3: *
   4: *	Uses an algorithm due to Harold Stone, which finds
   5: *	a pair of longest identical subsequences in the two
   6: *	files.
   7: *
   8: *	The major goal is to generate the match vector J.
   9: *	J[i] is the index of the line in file1 corresponding
  10: *	to line i file0. J[i] = 0 if there is no
  11: *	such line in file1.
  12: *
  13: *	Lines are hashed so as to work in core. All potential
  14: *	matches are located by sorting the lines of each file
  15: *	on the hash (called value_____). In particular, this
  16: *	collects the equivalence classes in file1 together.
  17: *	Subroutine equiv____  replaces the value of each line in
  18: *	file0 by the index of the first element of its
  19: *	matching equivalence in (the reordered) file1.
  20: *	To save space equiv_____ squeezes file1 into a single
  21: *	array member______ in which the equivalence classes
  22: *	are simply concatenated, except that their first
  23: *	members are flagged by changing sign.
  24: *
  25: *	Next the indices that point into member______ are unsorted_______   into
  26: *	array class_____ according to the original order of file0.
  27: *
  28: *	The cleverness lies in routine stone______. This marches
  29: *	through the lines of file0, developing a vector klist_____
  30: *	of "k-candidates". At step i a k-candidate is a matched
  31: *	pair of lines x,y (x in file0 y in file1) such that
  32: *	there is a common subsequence of lenght k
  33: *	between the first i lines of file0 and the first y
  34: *	lines of file1, but there is no such subsequence for
  35: *	any smaller y. x is the earliest possible mate to y
  36: *	that occurs in such a subsequence.
  37: *
  38: *	Whenever any of the members of the equivalence class of
  39: *	lines in file1 matable to a line in file0 has serial number
  40: *	less than the y of some k-candidate, that k-candidate
  41: *	with the smallest such y is replaced. The new
  42: *	k-candidate is chained (via pred____) to the current
  43: *	k-1 candidate so that the actual subsequence can
  44: *	be recovered. When a member has serial number greater
  45: *	that the y of all k-candidates, the klist is extended.
  46: *	At the end, the longest subsequence is pulled out
  47: *	and placed in the array J by unravel_______.
  48: *
  49: *	With J in hand, the matches there recorded are
  50: *	check_____ed against reality to assure that no spurious
  51: *	matches have crept in due to hashing. If they have,
  52: *	they are broken, and "jackpot " is recorded--a harmless
  53: *	matter except that a true match for a spuriously
  54: *	mated line may now be unnecessarily reported as a change.
  55: *
  56: *	Much of the complexity of the program comes simply
  57: *	from trying to minimize core utilization and
  58: *	maximize the range of doable problems by dynamically
  59: *	allocating what is needed and reusing what is not.
  60: *	The core requirements for problems larger than somewhat
  61: *	are (in words) 2*length(file0) + length(file1) +
  62: *	3*(number of k-candidates installed),  typically about
  63: *	6n words for files of length n.
  64: */
  65: #include <stdio.h>
  66: #include <ctype.h>
  67: #include <sys/types.h>
  68: #include <sys/stat.h>
  69: #include <signal.h>
  70: #define prints(s)   fputs(s,stdout)
  71: 
  72: #define HALFLONG 16
  73: #define low(x)  (x&((1L<<HALFLONG)-1))
  74: #define high(x) (x>>HALFLONG)
  75: FILE *input[2];
  76: FILE *fopen();
  77: 
  78: struct cand {
  79:     int x;
  80:     int y;
  81:     int pred;
  82: } cand;
  83: struct line {
  84:     int serial;
  85:     int value;
  86: } *file[2], line;
  87: int len[2];
  88: struct line *sfile[2];  /*shortened by pruning common prefix and suffix*/
  89: int slen[2];
  90: int pref, suff; /*length of prefix and suffix*/
  91: int *class; /*will be overlaid on file[0]*/
  92: int *member;    /*will be overlaid on file[1]*/
  93: int *klist;     /*will be overlaid on file[0] after class*/
  94: struct cand *clist; /* merely a free storage pot for candidates */
  95: int clen = 0;
  96: int *J;     /*will be overlaid on class*/
  97: long *ixold;    /*will be overlaid on klist*/
  98: long *ixnew;    /*will be overlaid on file[1]*/
  99: int opt;    /* -1,0,1 = -e,normal,-f */
 100: int status = 2;
 101: int anychange = 0;
 102: char *empty = "";
 103: int bflag;
 104: 
 105: char *tempfile; /*used when comparing against std input*/
 106: char *mktemp();
 107: char *dummy;    /*used in resetting storage search ptr*/
 108: 
 109: done()
 110: {
 111:     unlink(tempfile);
 112:     exit(status);
 113: }
 114: 
 115: char *talloc(n)
 116: {
 117:     extern char *malloc();
 118:     register char *p;
 119:     p = malloc((unsigned)n);
 120:     if(p!=NULL)
 121:         return(p);
 122:     noroom();
 123: }
 124: 
 125: char *ralloc(p,n)   /*compacting reallocation */
 126: char *p;
 127: {
 128:     register char *q;
 129:     char *realloc();
 130:     free(p);
 131:     free(dummy);
 132:     dummy = malloc(1);
 133:     q = realloc(p, (unsigned)n);
 134:     if(q==NULL)
 135:         noroom();
 136:     return(q);
 137: }
 138: 
 139: noroom()
 140: {
 141:     mesg("files too big, try -h\n",empty);
 142:     done();
 143: }
 144: 
 145: sort(a,n)   /*shellsort CACM #201*/
 146: struct line *a;
 147: {
 148:     struct line w;
 149:     register int j,m;
 150:     struct line *ai;
 151:     register struct line *aim;
 152:     int k;
 153:     for(j=1;j<=n;j*= 2)
 154:         m = 2*j - 1;
 155:     for(m/=2;m!=0;m/=2) {
 156:         k = n-m;
 157:         for(j=1;j<=k;j++) {
 158:             for(ai = &a[j]; ai > a; ai -= m) {
 159:                 aim = &ai[m];
 160:                 if(aim < ai)
 161:                     break;  /*wraparound*/
 162:                 if(aim->value > ai[0].value ||
 163:                    aim->value == ai[0].value &&
 164:                    aim->serial > ai[0].serial)
 165:                     break;
 166:                 w.value = ai[0].value;
 167:                 ai[0].value = aim->value;
 168:                 aim->value = w.value;
 169:                 w.serial = ai[0].serial;
 170:                 ai[0].serial = aim->serial;
 171:                 aim->serial = w.serial;
 172:             }
 173:         }
 174:     }
 175: }
 176: 
 177: unsort(f, l, b)
 178: struct line *f;
 179: int *b;
 180: {
 181:     register int *a;
 182:     register int i;
 183:     a = (int *)talloc((l+1)*sizeof(int));
 184:     for(i=1;i<=l;i++)
 185:         a[f[i].serial] = f[i].value;
 186:     for(i=1;i<=l;i++)
 187:         b[i] = a[i];
 188:     free((char *)a);
 189: }
 190: 
 191: filename(pa1, pa2)
 192: char **pa1, **pa2;
 193: {
 194:     register char *a1, *b1, *a2;
 195:     char buf[512];
 196:     struct stat stbuf;
 197:     int i, f;
 198:     a1 = *pa1;
 199:     a2 = *pa2;
 200:     if(stat(a1,&stbuf)!=-1 && ((stbuf.st_mode&S_IFMT)==S_IFDIR)) {
 201:         b1 = *pa1 = malloc(100);
 202:         while(*b1++ = *a1++) ;
 203:         b1[-1] = '/';
 204:         a1 = b1;
 205:         while(*a1++ = *a2++)
 206:             if(*a2 && *a2!='/' && a2[-1]=='/')
 207:                 a1 = b1;
 208:     }
 209:     else if(a1[0]=='-'&&a1[1]==0&&tempfile==0) {
 210:         signal(SIGHUP,done);
 211:         signal(SIGINT,done);
 212:         signal(SIGPIPE,done);
 213:         signal(SIGTERM,done);
 214:         *pa1 = tempfile = mktemp("/tmp/dXXXXX");
 215:         if((f=creat(tempfile,0600)) < 0) {
 216:             mesg("cannot create ",tempfile);
 217:             done();
 218:         }
 219:         while((i=read(0,buf,512))>0)
 220:             write(f,buf,i);
 221:         close(f);
 222:     }
 223: }
 224: 
 225: prepare(i, arg)
 226: char *arg;
 227: {
 228:     register struct line *p;
 229:     register j,h;
 230:     if((input[i] = fopen(arg,"r")) == NULL){
 231:         mesg("cannot open ", arg);
 232:         done();
 233:     }
 234:     p = (struct line *)talloc(3*sizeof(line));
 235:     for(j=0; h=readhash(input[i]);) {
 236:         p = (struct line *)ralloc((char *)p,(++j+3)*sizeof(line));
 237:         p[j].value = h;
 238:     }
 239:     len[i] = j;
 240:     file[i] = p;
 241:     fclose(input[i]);
 242: }
 243: 
 244: prune()
 245: {
 246:     register i,j;
 247:     for(pref=0;pref<len[0]&&pref<len[1]&&
 248:         file[0][pref+1].value==file[1][pref+1].value;
 249:         pref++ ) ;
 250:     for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&&
 251:         file[0][len[0]-suff].value==file[1][len[1]-suff].value;
 252:         suff++) ;
 253:     for(j=0;j<2;j++) {
 254:         sfile[j] = file[j]+pref;
 255:         slen[j] = len[j]-pref-suff;
 256:         for(i=0;i<=slen[j];i++)
 257:             sfile[j][i].serial = i;
 258:     }
 259: }
 260: 
 261: equiv(a,n,b,m,c)
 262: struct line *a, *b;
 263: int *c;
 264: {
 265:     register int i, j;
 266:     i = j = 1;
 267:     while(i<=n && j<=m) {
 268:         if(a[i].value <b[j].value)
 269:             a[i++].value = 0;
 270:         else if(a[i].value == b[j].value)
 271:             a[i++].value = j;
 272:         else
 273:             j++;
 274:     }
 275:     while(i <= n)
 276:         a[i++].value = 0;
 277:     b[m+1].value = 0;
 278:     j = 0;
 279:     while(++j <= m) {
 280:         c[j] = -b[j].serial;
 281:         while(b[j+1].value == b[j].value) {
 282:             j++;
 283:             c[j] = b[j].serial;
 284:         }
 285:     }
 286:     c[j] = -1;
 287: }
 288: 
 289: main(argc, argv)
 290: char **argv;
 291: {
 292:     register int k;
 293:     char **args;
 294: 
 295:     args = argv;
 296:     if(argc>3 && *argv[1]=='-') {
 297:         argc--;
 298:         argv++;
 299:         for(k=1;argv[0][k];k++) {
 300:             switch(argv[0][k]) {
 301:             case 'e':
 302:                 opt = -1;
 303:                 break;
 304:             case 'f':
 305:                 opt = 1;
 306:                 break;
 307:             case 'b':
 308:                 bflag = 1;
 309:                 break;
 310:             case 'h':
 311:                 execv("/usr/lib/diffh",args);
 312:                 mesg("cannot find diffh",empty);
 313:                 done();
 314:             }
 315:         }
 316:     }
 317:     if(argc!=3) {
 318:         mesg("arg count",empty);
 319:         done();
 320:     }
 321:     dummy = malloc(1);
 322: 
 323:     filename(&argv[1], &argv[2]);
 324:     filename(&argv[2], &argv[1]);
 325:     prepare(0, argv[1]);
 326:     prepare(1, argv[2]);
 327:     prune();
 328:     sort(sfile[0],slen[0]);
 329:     sort(sfile[1],slen[1]);
 330: 
 331:     member = (int *)file[1];
 332:     equiv(sfile[0], slen[0], sfile[1], slen[1], member);
 333:     member = (int *)ralloc((char *)member,(slen[1]+2)*sizeof(int));
 334: 
 335:     class = (int *)file[0];
 336:     unsort(sfile[0], slen[0], class);
 337:     class = (int *)ralloc((char *)class,(slen[0]+2)*sizeof(int));
 338: 
 339:     klist = (int *)talloc((slen[0]+2)*sizeof(int));
 340:     clist = (struct cand *)talloc(sizeof(cand));
 341:     k = stone(class, slen[0], member, klist);
 342:     free((char *)member);
 343:     free((char *)class);
 344: 
 345:     J = (int *)talloc((len[0]+2)*sizeof(int));
 346:     unravel(klist[k]);
 347:     free((char *)clist);
 348:     free((char *)klist);
 349: 
 350:     ixold = (long *)talloc((len[0]+2)*sizeof(long));
 351:     ixnew = (long *)talloc((len[1]+2)*sizeof(long));
 352:     check(argv);
 353:     output(argv);
 354:     status = anychange;
 355:     done();
 356: }
 357: 
 358: stone(a,n,b,c)
 359: int *a;
 360: int *b;
 361: int *c;
 362: {
 363:     register int i, k,y;
 364:     int j, l;
 365:     int oldc, tc;
 366:     int oldl;
 367:     k = 0;
 368:     c[0] = newcand(0,0,0);
 369:     for(i=1; i<=n; i++) {
 370:         j = a[i];
 371:         if(j==0)
 372:             continue;
 373:         y = -b[j];
 374:         oldl = 0;
 375:         oldc = c[0];
 376:         do {
 377:             if(y <= clist[oldc].y)
 378:                 continue;
 379:             l = search(c, k, y);
 380:             if(l!=oldl+1)
 381:                 oldc = c[l-1];
 382:             if(l<=k) {
 383:                 if(clist[c[l]].y <= y)
 384:                     continue;
 385:                 tc = c[l];
 386:                 c[l] = newcand(i,y,oldc);
 387:                 oldc = tc;
 388:                 oldl = l;
 389:             } else {
 390:                 c[l] = newcand(i,y,oldc);
 391:                 k++;
 392:                 break;
 393:             }
 394:         } while((y=b[++j]) > 0);
 395:     }
 396:     return(k);
 397: }
 398: 
 399: newcand(x,y,pred)
 400: {
 401:     register struct cand *q;
 402:     clist = (struct cand *)ralloc((char *)clist,++clen*sizeof(cand));
 403:     q = clist + clen -1;
 404:     q->x = x;
 405:     q->y = y;
 406:     q->pred = pred;
 407:     return(clen-1);
 408: }
 409: 
 410: search(c, k, y)
 411: int *c;
 412: {
 413:     register int i, j, l;
 414:     int t;
 415:     if(clist[c[k]].y<y) /*quick look for typical case*/
 416:         return(k+1);
 417:     i = 0;
 418:     j = k+1;
 419:     while((l=(i+j)/2) > i) {
 420:         t = clist[c[l]].y;
 421:         if(t > y)
 422:             j = l;
 423:         else if(t < y)
 424:             i = l;
 425:         else
 426:             return(l);
 427:     }
 428:     return(l+1);
 429: }
 430: 
 431: unravel(p)
 432: {
 433:     register int i;
 434:     register struct cand *q;
 435:     for(i=0; i<=len[0]; i++)
 436:         J[i] =  i<=pref ? i:
 437:             i>len[0]-suff ? i+len[1]-len[0]:
 438:             0;
 439:     for(q=clist+p;q->y!=0;q=clist+q->pred)
 440:         J[q->x+pref] = q->y+pref;
 441: }
 442: 
 443: /* check does double duty:
 444: 1.  ferret out any fortuitous correspondences due
 445: to confounding by hashing (which result in "jackpot")
 446: 2.  collect random access indexes to the two files */
 447: 
 448: check(argv)
 449: char **argv;
 450: {
 451:     register int i, j;
 452:     int jackpot;
 453:     long ctold, ctnew;
 454:     char c,d;
 455:     input[0] = fopen(argv[1],"r");
 456:     input[1] = fopen(argv[2],"r");
 457:     j = 1;
 458:     ixold[0] = ixnew[0] = 0;
 459:     jackpot = 0;
 460:     ctold = ctnew = 0;
 461:     for(i=1;i<=len[0];i++) {
 462:         if(J[i]==0) {
 463:             ixold[i] = ctold += skipline(0);
 464:             continue;
 465:         }
 466:         while(j<J[i]) {
 467:             ixnew[j] = ctnew += skipline(1);
 468:             j++;
 469:         }
 470:         for(;;) {
 471:             c = getc(input[0]);
 472:             d = getc(input[1]);
 473:             ctold++;
 474:             ctnew++;
 475:             if(bflag && isspace(c) && isspace(d)) {
 476:                 do {
 477:                     if(c=='\n') break;
 478:                     ctold++;
 479:                 } while(isspace(c=getc(input[0])));
 480:                 do {
 481:                     if(d=='\n') break;
 482:                     ctnew++;
 483:                 } while(isspace(d=getc(input[1])));
 484:             }
 485:             if(c!=d) {
 486:                 jackpot++;
 487:                 J[i] = 0;
 488:                 if(c!='\n')
 489:                     ctold += skipline(0);
 490:                 if(d!='\n')
 491:                     ctnew += skipline(1);
 492:                 break;
 493:             }
 494:             if(c=='\n')
 495:                 break;
 496:         }
 497:         ixold[i] = ctold;
 498:         ixnew[j] = ctnew;
 499:         j++;
 500:     }
 501:     for(;j<=len[1];j++) {
 502:         ixnew[j] = ctnew += skipline(1);
 503:     }
 504:     fclose(input[0]);
 505:     fclose(input[1]);
 506: /*
 507: 	if(jackpot)
 508: 		mesg("jackpot",empty);
 509: */
 510: }
 511: 
 512: skipline(f)
 513: {
 514:     register i;
 515:     for(i=1;getc(input[f])!='\n';i++) ;
 516:     return(i);
 517: }
 518: 
 519: output(argv)
 520: char **argv;
 521: {
 522:     int m;
 523:     register int i0, i1, j1;
 524:     int j0;
 525:     input[0] = fopen(argv[1],"r");
 526:     input[1] = fopen(argv[2],"r");
 527:     m = len[0];
 528:     J[0] = 0;
 529:     J[m+1] = len[1]+1;
 530:     if(opt!=-1) for(i0=1;i0<=m;i0=i1+1) {
 531:         while(i0<=m&&J[i0]==J[i0-1]+1) i0++;
 532:         j0 = J[i0-1]+1;
 533:         i1 = i0-1;
 534:         while(i1<m&&J[i1+1]==0) i1++;
 535:         j1 = J[i1+1]-1;
 536:         J[i1] = j1;
 537:         change(i0,i1,j0,j1);
 538:     } else for(i0=m;i0>=1;i0=i1-1) {
 539:         while(i0>=1&&J[i0]==J[i0+1]-1&&J[i0]!=0) i0--;
 540:         j0 = J[i0+1]-1;
 541:         i1 = i0+1;
 542:         while(i1>1&&J[i1-1]==0) i1--;
 543:         j1 = J[i1-1]+1;
 544:         J[i1] = j1;
 545:         change(i1,i0,j1,j0);
 546:     }
 547:     if(m==0)
 548:         change(1,0,1,len[1]);
 549: }
 550: 
 551: change(a,b,c,d)
 552: {
 553:     if(a>b&&c>d) return;
 554:     anychange = 1;
 555:     if(opt!=1) {
 556:         range(a,b,",");
 557:         putchar(a>b?'a':c>d?'d':'c');
 558:         if(opt!=-1) range(c,d,",");
 559:     } else {
 560:         putchar(a>b?'a':c>d?'d':'c');
 561:         range(a,b," ");
 562:     }
 563:     putchar('\n');
 564:     if(opt==0) {
 565:         fetch(ixold,a,b,input[0],"< ");
 566:         if(a<=b&&c<=d) prints("---\n");
 567:     }
 568:     fetch(ixnew,c,d,input[1],opt==0?"> ":empty);
 569:     if(opt!=0&&c<=d) prints(".\n");
 570: }
 571: 
 572: range(a,b,separator)
 573: char *separator;
 574: {
 575:     printf("%d", a>b?b:a);
 576:     if(a<b) {
 577:         printf("%s%d", separator, b);
 578:     }
 579: }
 580: 
 581: fetch(f,a,b,lb,s)
 582: long *f;
 583: FILE *lb;
 584: char *s;
 585: {
 586:     register int i, j;
 587:     register int nc;
 588:     for(i=a;i<=b;i++) {
 589:         fseek(lb,f[i-1],0);
 590:         nc = f[i]-f[i-1];
 591:         prints(s);
 592:         for(j=0;j<nc;j++)
 593:             putchar(getc(lb));
 594:     }
 595: }
 596: 
 597: /* hashing has the effect of
 598:  * arranging line in 7-bit bytes and then
 599:  * summing 1-s complement in 16-bit hunks
 600: */
 601: 
 602: readhash(f)
 603: FILE *f;
 604: {
 605:     long sum;
 606:     register unsigned shift;
 607:     register space;
 608:     register t;
 609:     sum = 1;
 610:     space = 0;
 611:     if(!bflag) for(shift=0;(t=getc(f))!='\n';shift+=7) {
 612:         if(t==-1)
 613:             return(0);
 614:         sum += (long)t << (shift%=HALFLONG);
 615:     }
 616:     else for(shift=0;;) {
 617:         switch(t=getc(f)) {
 618:         case -1:
 619:             return(0);
 620:         case '\t':
 621:         case ' ':
 622:             space++;
 623:             continue;
 624:         default:
 625:             if(space) {
 626:                 shift += 7;
 627:                 space = 0;
 628:             }
 629:             sum += (long)t << (shift%=HALFLONG);
 630:             shift += 7;
 631:             continue;
 632:         case '\n':
 633:             break;
 634:         }
 635:         break;
 636:     }
 637:     sum = low(sum) + high(sum);
 638:     return((short)low(sum) + (short)high(sum));
 639: }
 640: 
 641: mesg(s,t)
 642: char *s, *t;
 643: {
 644:     fprintf(stderr,"diff: %s%s\n",s,t);
 645: }

Defined functions

change defined in line 551; used 3 times
check defined in line 448; used 1 times
done defined in line 109; used 10 times
equiv defined in line 261; used 1 times
fetch defined in line 581; used 2 times
filename defined in line 191; used 2 times
main defined in line 289; never used
mesg defined in line 641; used 5 times
newcand defined in line 399; used 3 times
noroom defined in line 139; used 2 times
output defined in line 519; used 1 times
prepare defined in line 225; used 2 times
prune defined in line 244; used 1 times
ralloc defined in line 125; used 4 times
range defined in line 572; used 3 times
readhash defined in line 602; used 1 times
search defined in line 410; used 1 times
skipline defined in line 512; used 5 times
sort defined in line 145; used 2 times
stone defined in line 358; used 1 times
talloc defined in line 115; used 7 times
unravel defined in line 431; used 1 times
unsort defined in line 177; used 1 times

Defined variables

J defined in line 96; used 21 times
anychange defined in line 101; used 2 times
bflag defined in line 103; used 3 times
cand defined in line 82; used 2 times
class defined in line 91; used 6 times
clen defined in line 95; used 3 times
clist defined in line 94; used 11 times
dummy defined in line 107; used 3 times
empty defined in line 102; used 4 times
file defined in line 86; used 8 times
ixnew defined in line 98; used 6 times
ixold defined in line 97; used 5 times
klist defined in line 93; used 4 times
len defined in line 87; used 20 times
line defined in line 86; used 2 times
member defined in line 92; used 6 times
opt defined in line 99; used 8 times
pref defined in line 90; used 13 times
sfile defined in line 88; used 7 times
slen defined in line 89; used 11 times
status defined in line 100; used 2 times
suff defined in line 90; used 8 times
tempfile defined in line 105; used 5 times

Defined struct's

cand defined in line 78; used 10 times
line defined in line 83; used 20 times

Defined macros

HALFLONG defined in line 72; used 4 times
high defined in line 74; used 2 times
low defined in line 73; used 2 times
prints defined in line 70; used 3 times
Last modified: 1979-01-10
Generated: 2016-12-26
Generated by src2html V0.67
page hit count: 1607
Valid CSS Valid XHTML 1.0 Strict