1: # 2: /* diff - differential file comparison 3: * 4: * Uses an algorithm due to Harold Stone, which finds 5: * a pair of longest identical subsequences in the two 6: * files. 7: * 8: * The major goal is to generate the match vector J. 9: * J[i] is the index of the line in file1 corresponding 10: * to line i file0. J[i] = 0 if there is no 11: * such line in file1. 12: * 13: * Lines are hashed so as to work in core. All potential 14: * matches are located by sorting the lines of each file 15: * on the hash (called value_____). In particular, this 16: * collects the equivalence classes in file1 together. 17: * Subroutine equiv____ replaces the value of each line in 18: * file0 by the index of the first element of its 19: * matching equivalence in (the reordered) file1. 20: * To save space equiv_____ squeezes file1 into a single 21: * array member______ in which the equivalence classes 22: * are simply concatenated, except that their first 23: * members are flagged by changing sign. 24: * 25: * Next the indices that point into member______ are unsorted_______ into 26: * array class_____ according to the original order of file0. 27: * 28: * The cleverness lies in routine stone______. This marches 29: * through the lines of file0, developing a vector klist_____ 30: * of "k-candidates". At step i a k-candidate is a matched 31: * pair of lines x,y (x in file0 y in file1) such that 32: * there is a common subsequence of lenght k 33: * between the first i lines of file0 and the first y 34: * lines of file1, but there is no such subsequence for 35: * any smaller y. x is the earliest possible mate to y 36: * that occurs in such a subsequence. 37: * 38: * Whenever any of the members of the equivalence class of 39: * lines in file1 matable to a line in file0 has serial number 40: * less than the y of some k-candidate, that k-candidate 41: * with the smallest such y is replaced. The new 42: * k-candidate is chained (via pred____) to the current 43: * k-1 candidate so that the actual subsequence can 44: * be recovered. When a member has serial number greater 45: * that the y of all k-candidates, the klist is extended. 46: * At the end, the longest subsequence is pulled out 47: * and placed in the array J by unravel_______. 48: * 49: * With J in hand, the matches there recorded are 50: * check_____ed against reality to assure that no spurious 51: * matches have crept in due to hashing. If they have, 52: * they are broken, and "jackpot " is recorded--a harmless 53: * matter except that a true match for a spuriously 54: * mated line may now be unnecessarily reported as a change. 55: * 56: * Much of the complexity of the program comes simply 57: * from trying to minimize core utilization and 58: * maximize the range of doable problems by dynamically 59: * allocating what is needed and reusing what is not. 60: * The core requirements for problems larger than somewhat 61: * are (in words) 2*length(file0) + length(file1) + 62: * 3*(number of k-candidates installed), typically about 63: * 6n words for files of length n. 64: */ 65: #include <stdio.h> 66: #include <ctype.h> 67: #include <sys/types.h> 68: #include <sys/stat.h> 69: #include <signal.h> 70: #define prints(s) fputs(s,stdout) 71: 72: #define HALFLONG 16 73: #define low(x) (x&((1L<<HALFLONG)-1)) 74: #define high(x) (x>>HALFLONG) 75: FILE *input[2]; 76: FILE *fopen(); 77: 78: struct cand { 79: int x; 80: int y; 81: int pred; 82: } cand; 83: struct line { 84: int serial; 85: int value; 86: } *file[2], line; 87: int len[2]; 88: struct line *sfile[2]; /*shortened by pruning common prefix and suffix*/ 89: int slen[2]; 90: int pref, suff; /*length of prefix and suffix*/ 91: int *class; /*will be overlaid on file[0]*/ 92: int *member; /*will be overlaid on file[1]*/ 93: int *klist; /*will be overlaid on file[0] after class*/ 94: struct cand *clist; /* merely a free storage pot for candidates */ 95: int clen = 0; 96: int *J; /*will be overlaid on class*/ 97: long *ixold; /*will be overlaid on klist*/ 98: long *ixnew; /*will be overlaid on file[1]*/ 99: int opt; /* -1,0,1 = -e,normal,-f */ 100: int status = 2; 101: int anychange = 0; 102: char *empty = ""; 103: int bflag; 104: 105: char *tempfile; /*used when comparing against std input*/ 106: char *mktemp(); 107: char *dummy; /*used in resetting storage search ptr*/ 108: 109: done() 110: { 111: unlink(tempfile); 112: exit(status); 113: } 114: 115: char *talloc(n) 116: { 117: extern char *malloc(); 118: register char *p; 119: p = malloc((unsigned)n); 120: if(p!=NULL) 121: return(p); 122: noroom(); 123: } 124: 125: char *ralloc(p,n) /*compacting reallocation */ 126: char *p; 127: { 128: register char *q; 129: char *realloc(); 130: free(p); 131: free(dummy); 132: dummy = malloc(1); 133: q = realloc(p, (unsigned)n); 134: if(q==NULL) 135: noroom(); 136: return(q); 137: } 138: 139: noroom() 140: { 141: mesg("files too big, try -h\n",empty); 142: done(); 143: } 144: 145: sort(a,n) /*shellsort CACM #201*/ 146: struct line *a; 147: { 148: struct line w; 149: register int j,m; 150: struct line *ai; 151: register struct line *aim; 152: int k; 153: for(j=1;j<=n;j*= 2) 154: m = 2*j - 1; 155: for(m/=2;m!=0;m/=2) { 156: k = n-m; 157: for(j=1;j<=k;j++) { 158: for(ai = &a[j]; ai > a; ai -= m) { 159: aim = &ai[m]; 160: if(aim < ai) 161: break; /*wraparound*/ 162: if(aim->value > ai[0].value || 163: aim->value == ai[0].value && 164: aim->serial > ai[0].serial) 165: break; 166: w.value = ai[0].value; 167: ai[0].value = aim->value; 168: aim->value = w.value; 169: w.serial = ai[0].serial; 170: ai[0].serial = aim->serial; 171: aim->serial = w.serial; 172: } 173: } 174: } 175: } 176: 177: unsort(f, l, b) 178: struct line *f; 179: int *b; 180: { 181: register int *a; 182: register int i; 183: a = (int *)talloc((l+1)*sizeof(int)); 184: for(i=1;i<=l;i++) 185: a[f[i].serial] = f[i].value; 186: for(i=1;i<=l;i++) 187: b[i] = a[i]; 188: free((char *)a); 189: } 190: 191: filename(pa1, pa2) 192: char **pa1, **pa2; 193: { 194: register char *a1, *b1, *a2; 195: char buf[512]; 196: struct stat stbuf; 197: int i, f; 198: a1 = *pa1; 199: a2 = *pa2; 200: if(stat(a1,&stbuf)!=-1 && ((stbuf.st_mode&S_IFMT)==S_IFDIR)) { 201: b1 = *pa1 = malloc(100); 202: while(*b1++ = *a1++) ; 203: b1[-1] = '/'; 204: a1 = b1; 205: while(*a1++ = *a2++) 206: if(*a2 && *a2!='/' && a2[-1]=='/') 207: a1 = b1; 208: } 209: else if(a1[0]=='-'&&a1[1]==0&&tempfile==0) { 210: signal(SIGHUP,done); 211: signal(SIGINT,done); 212: signal(SIGPIPE,done); 213: signal(SIGTERM,done); 214: *pa1 = tempfile = mktemp("/tmp/dXXXXX"); 215: if((f=creat(tempfile,0600)) < 0) { 216: mesg("cannot create ",tempfile); 217: done(); 218: } 219: while((i=read(0,buf,512))>0) 220: write(f,buf,i); 221: close(f); 222: } 223: } 224: 225: prepare(i, arg) 226: char *arg; 227: { 228: register struct line *p; 229: register j,h; 230: if((input[i] = fopen(arg,"r")) == NULL){ 231: mesg("cannot open ", arg); 232: done(); 233: } 234: p = (struct line *)talloc(3*sizeof(line)); 235: for(j=0; h=readhash(input[i]);) { 236: p = (struct line *)ralloc((char *)p,(++j+3)*sizeof(line)); 237: p[j].value = h; 238: } 239: len[i] = j; 240: file[i] = p; 241: fclose(input[i]); 242: } 243: 244: prune() 245: { 246: register i,j; 247: for(pref=0;pref<len[0]&&pref<len[1]&& 248: file[0][pref+1].value==file[1][pref+1].value; 249: pref++ ) ; 250: for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&& 251: file[0][len[0]-suff].value==file[1][len[1]-suff].value; 252: suff++) ; 253: for(j=0;j<2;j++) { 254: sfile[j] = file[j]+pref; 255: slen[j] = len[j]-pref-suff; 256: for(i=0;i<=slen[j];i++) 257: sfile[j][i].serial = i; 258: } 259: } 260: 261: equiv(a,n,b,m,c) 262: struct line *a, *b; 263: int *c; 264: { 265: register int i, j; 266: i = j = 1; 267: while(i<=n && j<=m) { 268: if(a[i].value <b[j].value) 269: a[i++].value = 0; 270: else if(a[i].value == b[j].value) 271: a[i++].value = j; 272: else 273: j++; 274: } 275: while(i <= n) 276: a[i++].value = 0; 277: b[m+1].value = 0; 278: j = 0; 279: while(++j <= m) { 280: c[j] = -b[j].serial; 281: while(b[j+1].value == b[j].value) { 282: j++; 283: c[j] = b[j].serial; 284: } 285: } 286: c[j] = -1; 287: } 288: 289: main(argc, argv) 290: char **argv; 291: { 292: register int k; 293: char **args; 294: 295: args = argv; 296: if(argc>3 && *argv[1]=='-') { 297: argc--; 298: argv++; 299: for(k=1;argv[0][k];k++) { 300: switch(argv[0][k]) { 301: case 'e': 302: opt = -1; 303: break; 304: case 'f': 305: opt = 1; 306: break; 307: case 'b': 308: bflag = 1; 309: break; 310: case 'h': 311: execv("/usr/lib/diffh",args); 312: mesg("cannot find diffh",empty); 313: done(); 314: } 315: } 316: } 317: if(argc!=3) { 318: mesg("arg count",empty); 319: done(); 320: } 321: dummy = malloc(1); 322: 323: filename(&argv[1], &argv[2]); 324: filename(&argv[2], &argv[1]); 325: prepare(0, argv[1]); 326: prepare(1, argv[2]); 327: prune(); 328: sort(sfile[0],slen[0]); 329: sort(sfile[1],slen[1]); 330: 331: member = (int *)file[1]; 332: equiv(sfile[0], slen[0], sfile[1], slen[1], member); 333: member = (int *)ralloc((char *)member,(slen[1]+2)*sizeof(int)); 334: 335: class = (int *)file[0]; 336: unsort(sfile[0], slen[0], class); 337: class = (int *)ralloc((char *)class,(slen[0]+2)*sizeof(int)); 338: 339: klist = (int *)talloc((slen[0]+2)*sizeof(int)); 340: clist = (struct cand *)talloc(sizeof(cand)); 341: k = stone(class, slen[0], member, klist); 342: free((char *)member); 343: free((char *)class); 344: 345: J = (int *)talloc((len[0]+2)*sizeof(int)); 346: unravel(klist[k]); 347: free((char *)clist); 348: free((char *)klist); 349: 350: ixold = (long *)talloc((len[0]+2)*sizeof(long)); 351: ixnew = (long *)talloc((len[1]+2)*sizeof(long)); 352: check(argv); 353: output(argv); 354: status = anychange; 355: done(); 356: } 357: 358: stone(a,n,b,c) 359: int *a; 360: int *b; 361: int *c; 362: { 363: register int i, k,y; 364: int j, l; 365: int oldc, tc; 366: int oldl; 367: k = 0; 368: c[0] = newcand(0,0,0); 369: for(i=1; i<=n; i++) { 370: j = a[i]; 371: if(j==0) 372: continue; 373: y = -b[j]; 374: oldl = 0; 375: oldc = c[0]; 376: do { 377: if(y <= clist[oldc].y) 378: continue; 379: l = search(c, k, y); 380: if(l!=oldl+1) 381: oldc = c[l-1]; 382: if(l<=k) { 383: if(clist[c[l]].y <= y) 384: continue; 385: tc = c[l]; 386: c[l] = newcand(i,y,oldc); 387: oldc = tc; 388: oldl = l; 389: } else { 390: c[l] = newcand(i,y,oldc); 391: k++; 392: break; 393: } 394: } while((y=b[++j]) > 0); 395: } 396: return(k); 397: } 398: 399: newcand(x,y,pred) 400: { 401: register struct cand *q; 402: clist = (struct cand *)ralloc((char *)clist,++clen*sizeof(cand)); 403: q = clist + clen -1; 404: q->x = x; 405: q->y = y; 406: q->pred = pred; 407: return(clen-1); 408: } 409: 410: search(c, k, y) 411: int *c; 412: { 413: register int i, j, l; 414: int t; 415: if(clist[c[k]].y<y) /*quick look for typical case*/ 416: return(k+1); 417: i = 0; 418: j = k+1; 419: while((l=(i+j)/2) > i) { 420: t = clist[c[l]].y; 421: if(t > y) 422: j = l; 423: else if(t < y) 424: i = l; 425: else 426: return(l); 427: } 428: return(l+1); 429: } 430: 431: unravel(p) 432: { 433: register int i; 434: register struct cand *q; 435: for(i=0; i<=len[0]; i++) 436: J[i] = i<=pref ? i: 437: i>len[0]-suff ? i+len[1]-len[0]: 438: 0; 439: for(q=clist+p;q->y!=0;q=clist+q->pred) 440: J[q->x+pref] = q->y+pref; 441: } 442: 443: /* check does double duty: 444: 1. ferret out any fortuitous correspondences due 445: to confounding by hashing (which result in "jackpot") 446: 2. collect random access indexes to the two files */ 447: 448: check(argv) 449: char **argv; 450: { 451: register int i, j; 452: int jackpot; 453: long ctold, ctnew; 454: char c,d; 455: input[0] = fopen(argv[1],"r"); 456: input[1] = fopen(argv[2],"r"); 457: j = 1; 458: ixold[0] = ixnew[0] = 0; 459: jackpot = 0; 460: ctold = ctnew = 0; 461: for(i=1;i<=len[0];i++) { 462: if(J[i]==0) { 463: ixold[i] = ctold += skipline(0); 464: continue; 465: } 466: while(j<J[i]) { 467: ixnew[j] = ctnew += skipline(1); 468: j++; 469: } 470: for(;;) { 471: c = getc(input[0]); 472: d = getc(input[1]); 473: ctold++; 474: ctnew++; 475: if(bflag && isspace(c) && isspace(d)) { 476: do { 477: if(c=='\n') break; 478: ctold++; 479: } while(isspace(c=getc(input[0]))); 480: do { 481: if(d=='\n') break; 482: ctnew++; 483: } while(isspace(d=getc(input[1]))); 484: } 485: if(c!=d) { 486: jackpot++; 487: J[i] = 0; 488: if(c!='\n') 489: ctold += skipline(0); 490: if(d!='\n') 491: ctnew += skipline(1); 492: break; 493: } 494: if(c=='\n') 495: break; 496: } 497: ixold[i] = ctold; 498: ixnew[j] = ctnew; 499: j++; 500: } 501: for(;j<=len[1];j++) { 502: ixnew[j] = ctnew += skipline(1); 503: } 504: fclose(input[0]); 505: fclose(input[1]); 506: /* 507: if(jackpot) 508: mesg("jackpot",empty); 509: */ 510: } 511: 512: skipline(f) 513: { 514: register i; 515: for(i=1;getc(input[f])!='\n';i++) ; 516: return(i); 517: } 518: 519: output(argv) 520: char **argv; 521: { 522: int m; 523: register int i0, i1, j1; 524: int j0; 525: input[0] = fopen(argv[1],"r"); 526: input[1] = fopen(argv[2],"r"); 527: m = len[0]; 528: J[0] = 0; 529: J[m+1] = len[1]+1; 530: if(opt!=-1) for(i0=1;i0<=m;i0=i1+1) { 531: while(i0<=m&&J[i0]==J[i0-1]+1) i0++; 532: j0 = J[i0-1]+1; 533: i1 = i0-1; 534: while(i1<m&&J[i1+1]==0) i1++; 535: j1 = J[i1+1]-1; 536: J[i1] = j1; 537: change(i0,i1,j0,j1); 538: } else for(i0=m;i0>=1;i0=i1-1) { 539: while(i0>=1&&J[i0]==J[i0+1]-1&&J[i0]!=0) i0--; 540: j0 = J[i0+1]-1; 541: i1 = i0+1; 542: while(i1>1&&J[i1-1]==0) i1--; 543: j1 = J[i1-1]+1; 544: J[i1] = j1; 545: change(i1,i0,j1,j0); 546: } 547: if(m==0) 548: change(1,0,1,len[1]); 549: } 550: 551: change(a,b,c,d) 552: { 553: if(a>b&&c>d) return; 554: anychange = 1; 555: if(opt!=1) { 556: range(a,b,","); 557: putchar(a>b?'a':c>d?'d':'c'); 558: if(opt!=-1) range(c,d,","); 559: } else { 560: putchar(a>b?'a':c>d?'d':'c'); 561: range(a,b," "); 562: } 563: putchar('\n'); 564: if(opt==0) { 565: fetch(ixold,a,b,input[0],"< "); 566: if(a<=b&&c<=d) prints("---\n"); 567: } 568: fetch(ixnew,c,d,input[1],opt==0?"> ":empty); 569: if(opt!=0&&c<=d) prints(".\n"); 570: } 571: 572: range(a,b,separator) 573: char *separator; 574: { 575: printf("%d", a>b?b:a); 576: if(a<b) { 577: printf("%s%d", separator, b); 578: } 579: } 580: 581: fetch(f,a,b,lb,s) 582: long *f; 583: FILE *lb; 584: char *s; 585: { 586: register int i, j; 587: register int nc; 588: for(i=a;i<=b;i++) { 589: fseek(lb,f[i-1],0); 590: nc = f[i]-f[i-1]; 591: prints(s); 592: for(j=0;j<nc;j++) 593: putchar(getc(lb)); 594: } 595: } 596: 597: /* hashing has the effect of 598: * arranging line in 7-bit bytes and then 599: * summing 1-s complement in 16-bit hunks 600: */ 601: 602: readhash(f) 603: FILE *f; 604: { 605: long sum; 606: register unsigned shift; 607: register space; 608: register t; 609: sum = 1; 610: space = 0; 611: if(!bflag) for(shift=0;(t=getc(f))!='\n';shift+=7) { 612: if(t==-1) 613: return(0); 614: sum += (long)t << (shift%=HALFLONG); 615: } 616: else for(shift=0;;) { 617: switch(t=getc(f)) { 618: case -1: 619: return(0); 620: case '\t': 621: case ' ': 622: space++; 623: continue; 624: default: 625: if(space) { 626: shift += 7; 627: space = 0; 628: } 629: sum += (long)t << (shift%=HALFLONG); 630: shift += 7; 631: continue; 632: case '\n': 633: break; 634: } 635: break; 636: } 637: sum = low(sum) + high(sum); 638: return((short)low(sum) + (short)high(sum)); 639: } 640: 641: mesg(s,t) 642: char *s, *t; 643: { 644: fprintf(stderr,"diff: %s%s\n",s,t); 645: }