1: static char sccsid[] = "@(#)diffreg.c 4.1 10/9/80"; 2: 3: #include "diff.h" 4: /* 5: * diff - compare two files. 6: */ 7: 8: /* 9: * Uses an algorithm due to Harold Stone, which finds 10: * a pair of longest identical subsequences in the two 11: * files. 12: * 13: * The major goal is to generate the match vector J. 14: * J[i] is the index of the line in file1 corresponding 15: * to line i file0. J[i] = 0 if there is no 16: * such line in file1. 17: * 18: * Lines are hashed so as to work in core. All potential 19: * matches are located by sorting the lines of each file 20: * on the hash (called ``value''). In particular, this 21: * collects the equivalence classes in file1 together. 22: * Subroutine equiv replaces the value of each line in 23: * file0 by the index of the first element of its 24: * matching equivalence in (the reordered) file1. 25: * To save space equiv squeezes file1 into a single 26: * array member in which the equivalence classes 27: * are simply concatenated, except that their first 28: * members are flagged by changing sign. 29: * 30: * Next the indices that point into member are unsorted into 31: * array class according to the original order of file0. 32: * 33: * The cleverness lies in routine stone. This marches 34: * through the lines of file0, developing a vector klist 35: * of "k-candidates". At step i a k-candidate is a matched 36: * pair of lines x,y (x in file0 y in file1) such that 37: * there is a common subsequence of length k 38: * between the first i lines of file0 and the first y 39: * lines of file1, but there is no such subsequence for 40: * any smaller y. x is the earliest possible mate to y 41: * that occurs in such a subsequence. 42: * 43: * Whenever any of the members of the equivalence class of 44: * lines in file1 matable to a line in file0 has serial number 45: * less than the y of some k-candidate, that k-candidate 46: * with the smallest such y is replaced. The new 47: * k-candidate is chained (via pred) to the current 48: * k-1 candidate so that the actual subsequence can 49: * be recovered. When a member has serial number greater 50: * that the y of all k-candidates, the klist is extended. 51: * At the end, the longest subsequence is pulled out 52: * and placed in the array J by unravel 53: * 54: * With J in hand, the matches there recorded are 55: * check'ed against reality to assure that no spurious 56: * matches have crept in due to hashing. If they have, 57: * they are broken, and "jackpot" is recorded--a harmless 58: * matter except that a true match for a spuriously 59: * mated line may now be unnecessarily reported as a change. 60: * 61: * Much of the complexity of the program comes simply 62: * from trying to minimize core utilization and 63: * maximize the range of doable problems by dynamically 64: * allocating what is needed and reusing what is not. 65: * The core requirements for problems larger than somewhat 66: * are (in words) 2*length(file0) + length(file1) + 67: * 3*(number of k-candidates installed), typically about 68: * 6n words for files of length n. 69: */ 70: 71: #define prints(s) fputs(s,stdout) 72: 73: FILE *input[2]; 74: FILE *fopen(); 75: 76: struct cand { 77: int x; 78: int y; 79: int pred; 80: } cand; 81: struct line { 82: int serial; 83: int value; 84: } *file[2], line; 85: int len[2]; 86: struct line *sfile[2]; /* shortened by pruning common prefix and suffix */ 87: int slen[2]; 88: int pref, suff; /* length of prefix and suffix */ 89: int *class; /* will be overlaid on file[0] */ 90: int *member; /* will be overlaid on file[1] */ 91: int *klist; /* will be overlaid on file[0] after class */ 92: struct cand *clist; /* merely a free storage pot for candidates */ 93: int clen = 0; 94: int *J; /* will be overlaid on class */ 95: long *ixold; /* will be overlaid on klist */ 96: long *ixnew; /* will be overlaid on file[1] */ 97: 98: diffreg() 99: { 100: register int k; 101: 102: if (hflag) { 103: diffargv[0] = "diffh"; 104: execv(diffh, diffargv); 105: fprintf(stderr, "diff: "); 106: perror(diffh); 107: done(); 108: } 109: dummy = malloc(1); 110: if ((stb1.st_mode & S_IFMT) == S_IFDIR) 111: file1 = splice(file1, file2); 112: else if ((stb2.st_mode & S_IFMT) == S_IFDIR) 113: file2 = splice(file2, file1); 114: else if (!strcmp(file1, "-")) { 115: if (!strcmp(file2, "-")) { 116: fprintf(stderr, "diff: can't specify - -\n"); 117: done(); 118: } 119: file1 = copytemp(); 120: } else if (!strcmp(file2, "-")) 121: file2 = copytemp(); 122: prepare(0, file1); 123: prepare(1, file2); 124: prune(); 125: sort(sfile[0],slen[0]); 126: sort(sfile[1],slen[1]); 127: 128: member = (int *)file[1]; 129: equiv(sfile[0], slen[0], sfile[1], slen[1], member); 130: member = (int *)ralloc((char *)member,(slen[1]+2)*sizeof(int)); 131: 132: class = (int *)file[0]; 133: unsort(sfile[0], slen[0], class); 134: class = (int *)ralloc((char *)class,(slen[0]+2)*sizeof(int)); 135: 136: klist = (int *)talloc((slen[0]+2)*sizeof(int)); 137: clist = (struct cand *)talloc(sizeof(cand)); 138: k = stone(class, slen[0], member, klist); 139: free((char *)member); 140: free((char *)class); 141: 142: J = (int *)talloc((len[0]+2)*sizeof(int)); 143: unravel(klist[k]); 144: free((char *)clist); 145: free((char *)klist); 146: 147: ixold = (long *)talloc((len[0]+2)*sizeof(long)); 148: ixnew = (long *)talloc((len[1]+2)*sizeof(long)); 149: check(); 150: output(); 151: status = anychange; 152: if (opt == D_CONTEXT && anychange == 0) 153: printf("No differences encountered\n"); 154: done(); 155: } 156: 157: char * 158: copytemp() 159: { 160: char buf[BUFSIZ]; 161: register int i, f; 162: 163: signal(SIGHUP,done); 164: signal(SIGINT,done); 165: signal(SIGPIPE,done); 166: signal(SIGTERM,done); 167: tempfile = mktemp("/tmp/dXXXXX"); 168: f = creat(tempfile,0600); 169: if (f < 0) { 170: fprintf(stderr, "diff: "); 171: perror(tempfile); 172: done(); 173: } 174: while ((i = read(0,buf,BUFSIZ)) > 0) 175: if (write(f,buf,i) != i) { 176: fprintf(stderr, "diff: "); 177: perror(tempfile); 178: done(); 179: } 180: close(f); 181: return (tempfile); 182: } 183: 184: char * 185: splice(dir, file) 186: char *dir, *file; 187: { 188: char *tail; 189: char buf[BUFSIZ]; 190: 191: if (!strcmp(file, "-")) { 192: fprintf(stderr, "diff: can't specify - with other arg directory\n"); 193: done(); 194: } 195: tail = rindex(file, '/'); 196: if (tail == 0) 197: tail = file; 198: else 199: tail++; 200: sprintf(buf, "%s/%s", dir, tail); 201: return (savestr(buf)); 202: } 203: 204: prepare(i, arg) 205: char *arg; 206: { 207: register struct line *p; 208: register j,h; 209: if((input[i] = fopen(arg,"r")) == NULL){ 210: fprintf(stderr, "diff: "); 211: perror(arg); 212: done(); 213: } 214: p = (struct line *)talloc(3*sizeof(line)); 215: for(j=0; h=readhash(input[i]);) { 216: p = (struct line *)ralloc((char *)p,(++j+3)*sizeof(line)); 217: p[j].value = h; 218: } 219: len[i] = j; 220: file[i] = p; 221: fclose(input[i]); 222: } 223: 224: prune() 225: { 226: register i,j; 227: for(pref=0;pref<len[0]&&pref<len[1]&& 228: file[0][pref+1].value==file[1][pref+1].value; 229: pref++ ) ; 230: for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&& 231: file[0][len[0]-suff].value==file[1][len[1]-suff].value; 232: suff++) ; 233: for(j=0;j<2;j++) { 234: sfile[j] = file[j]+pref; 235: slen[j] = len[j]-pref-suff; 236: for(i=0;i<=slen[j];i++) 237: sfile[j][i].serial = i; 238: } 239: } 240: 241: equiv(a,n,b,m,c) 242: struct line *a, *b; 243: int *c; 244: { 245: register int i, j; 246: i = j = 1; 247: while(i<=n && j<=m) { 248: if(a[i].value <b[j].value) 249: a[i++].value = 0; 250: else if(a[i].value == b[j].value) 251: a[i++].value = j; 252: else 253: j++; 254: } 255: while(i <= n) 256: a[i++].value = 0; 257: b[m+1].value = 0; 258: j = 0; 259: while(++j <= m) { 260: c[j] = -b[j].serial; 261: while(b[j+1].value == b[j].value) { 262: j++; 263: c[j] = b[j].serial; 264: } 265: } 266: c[j] = -1; 267: } 268: 269: stone(a,n,b,c) 270: int *a; 271: int *b; 272: int *c; 273: { 274: register int i, k,y; 275: int j, l; 276: int oldc, tc; 277: int oldl; 278: k = 0; 279: c[0] = newcand(0,0,0); 280: for(i=1; i<=n; i++) { 281: j = a[i]; 282: if(j==0) 283: continue; 284: y = -b[j]; 285: oldl = 0; 286: oldc = c[0]; 287: do { 288: if(y <= clist[oldc].y) 289: continue; 290: l = search(c, k, y); 291: if(l!=oldl+1) 292: oldc = c[l-1]; 293: if(l<=k) { 294: if(clist[c[l]].y <= y) 295: continue; 296: tc = c[l]; 297: c[l] = newcand(i,y,oldc); 298: oldc = tc; 299: oldl = l; 300: } else { 301: c[l] = newcand(i,y,oldc); 302: k++; 303: break; 304: } 305: } while((y=b[++j]) > 0); 306: } 307: return(k); 308: } 309: 310: newcand(x,y,pred) 311: { 312: register struct cand *q; 313: clist = (struct cand *)ralloc((char *)clist,++clen*sizeof(cand)); 314: q = clist + clen -1; 315: q->x = x; 316: q->y = y; 317: q->pred = pred; 318: return(clen-1); 319: } 320: 321: search(c, k, y) 322: int *c; 323: { 324: register int i, j, l; 325: int t; 326: if(clist[c[k]].y<y) /*quick look for typical case*/ 327: return(k+1); 328: i = 0; 329: j = k+1; 330: while((l=(i+j)/2) > i) { 331: t = clist[c[l]].y; 332: if(t > y) 333: j = l; 334: else if(t < y) 335: i = l; 336: else 337: return(l); 338: } 339: return(l+1); 340: } 341: 342: unravel(p) 343: { 344: register int i; 345: register struct cand *q; 346: for(i=0; i<=len[0]; i++) 347: J[i] = i<=pref ? i: 348: i>len[0]-suff ? i+len[1]-len[0]: 349: 0; 350: for(q=clist+p;q->y!=0;q=clist+q->pred) 351: J[q->x+pref] = q->y+pref; 352: } 353: 354: /* check does double duty: 355: 1. ferret out any fortuitous correspondences due 356: to confounding by hashing (which result in "jackpot") 357: 2. collect random access indexes to the two files */ 358: 359: check() 360: { 361: register int i, j; 362: int jackpot; 363: long ctold, ctnew; 364: char c,d; 365: input[0] = fopen(file1,"r"); 366: input[1] = fopen(file2,"r"); 367: j = 1; 368: ixold[0] = ixnew[0] = 0; 369: jackpot = 0; 370: ctold = ctnew = 0; 371: for(i=1;i<=len[0];i++) { 372: if(J[i]==0) { 373: ixold[i] = ctold += skipline(0); 374: continue; 375: } 376: while(j<J[i]) { 377: ixnew[j] = ctnew += skipline(1); 378: j++; 379: } 380: for(;;) { 381: c = getc(input[0]); 382: d = getc(input[1]); 383: ctold++; 384: ctnew++; 385: if(bflag && isspace(c) && isspace(d)) { 386: do { 387: if(c=='\n') break; 388: ctold++; 389: } while(isspace(c=getc(input[0]))); 390: do { 391: if(d=='\n') break; 392: ctnew++; 393: } while(isspace(d=getc(input[1]))); 394: } 395: if(c!=d) { 396: jackpot++; 397: J[i] = 0; 398: if(c!='\n') 399: ctold += skipline(0); 400: if(d!='\n') 401: ctnew += skipline(1); 402: break; 403: } 404: if(c=='\n') 405: break; 406: } 407: ixold[i] = ctold; 408: ixnew[j] = ctnew; 409: j++; 410: } 411: for(;j<=len[1];j++) { 412: ixnew[j] = ctnew += skipline(1); 413: } 414: fclose(input[0]); 415: fclose(input[1]); 416: /* 417: if(jackpot) 418: fprintf(stderr, "jackpot\n"); 419: */ 420: } 421: 422: sort(a,n) /*shellsort CACM #201*/ 423: struct line *a; 424: { 425: struct line w; 426: register int j,m; 427: struct line *ai; 428: register struct line *aim; 429: int k; 430: for(j=1;j<=n;j*= 2) 431: m = 2*j - 1; 432: for(m/=2;m!=0;m/=2) { 433: k = n-m; 434: for(j=1;j<=k;j++) { 435: for(ai = &a[j]; ai > a; ai -= m) { 436: aim = &ai[m]; 437: if(aim < ai) 438: break; /*wraparound*/ 439: if(aim->value > ai[0].value || 440: aim->value == ai[0].value && 441: aim->serial > ai[0].serial) 442: break; 443: w.value = ai[0].value; 444: ai[0].value = aim->value; 445: aim->value = w.value; 446: w.serial = ai[0].serial; 447: ai[0].serial = aim->serial; 448: aim->serial = w.serial; 449: } 450: } 451: } 452: } 453: 454: unsort(f, l, b) 455: struct line *f; 456: int *b; 457: { 458: register int *a; 459: register int i; 460: a = (int *)talloc((l+1)*sizeof(int)); 461: for(i=1;i<=l;i++) 462: a[f[i].serial] = f[i].value; 463: for(i=1;i<=l;i++) 464: b[i] = a[i]; 465: free((char *)a); 466: } 467: 468: skipline(f) 469: { 470: register i; 471: for(i=1;getc(input[f])!='\n';i++) ; 472: return(i); 473: } 474: 475: output() 476: { 477: int m; 478: register int i0, i1, j1; 479: int j0; 480: input[0] = fopen(file1,"r"); 481: input[1] = fopen(file2,"r"); 482: m = len[0]; 483: J[0] = 0; 484: J[m+1] = len[1]+1; 485: if(opt!=D_EDIT) for(i0=1;i0<=m;i0=i1+1) { 486: while(i0<=m&&J[i0]==J[i0-1]+1) i0++; 487: j0 = J[i0-1]+1; 488: i1 = i0-1; 489: while(i1<m&&J[i1+1]==0) i1++; 490: j1 = J[i1+1]-1; 491: J[i1] = j1; 492: change(i0,i1,j0,j1); 493: } else for(i0=m;i0>=1;i0=i1-1) { 494: while(i0>=1&&J[i0]==J[i0+1]-1&&J[i0]!=0) i0--; 495: j0 = J[i0+1]-1; 496: i1 = i0+1; 497: while(i1>1&&J[i1-1]==0) i1--; 498: j1 = J[i1-1]+1; 499: J[i1] = j1; 500: change(i1,i0,j1,j0); 501: } 502: if(m==0) 503: change(1,0,1,len[1]); 504: if (opt==D_IFDEF) { 505: for (;;) { 506: #define c i0 507: c = getc(input[0]); 508: if (c < 0) 509: return; 510: putchar(c); 511: } 512: #undef c 513: } 514: } 515: 516: /* indicate that there is a difference between lines a and b of the from file 517: to get to lines c to d of the to file. 518: If a is greater then b then there are no lines in the from file involved 519: and this means that there were lines appended (beginning at b). 520: If c is greater than d then there are lines missing from the to file. 521: */ 522: change(a,b,c,d) 523: { 524: char ch; 525: int lowa,upb,lowc,upd; 526: struct stat stbuf; 527: 528: if (opt != D_IFDEF && a>b && c>d) 529: return; 530: if (anychange == 0) { 531: anychange = 1; 532: if(opt == D_CONTEXT) { 533: printf("*** %s ", file1); 534: stat(file1, &stbuf); 535: printf("%s--- %s ", 536: ctime(&stbuf.st_mtime), file2); 537: stat(file2, &stbuf); 538: printf("%s", ctime(&stbuf.st_mtime)); 539: } 540: } 541: if (a <= b && c <= d) 542: ch = 'c'; 543: else 544: ch = (a <= b) ? 'd' : 'a'; 545: if(opt == D_CONTEXT) { 546: lowa = max(1, a-context); 547: upb = min(len[0], b+context); 548: lowc = max(1, c-context); 549: upd = min(len[1], d+context); 550: 551: /* print out from file */ 552: printf("***************\n*** "); 553: range(lowa,upb,","); 554: printf("\n"); 555: if (ch == 'a') 556: fetch(ixold,lowa,upb,input[0]," "); 557: else { 558: fetch(ixold,lowa,a-1,input[0]," "); 559: fetch(ixold,a,b,input[0],ch == 'c' ? "! " : "- "); 560: fetch(ixold,b+1,upb,input[0]," "); 561: } 562: putchar('\n'); 563: printf("--- "); 564: range(lowc,upd,","); 565: printf(" -----\n"); 566: if (ch == 'd') 567: fetch(ixnew,lowc,upd,input[1]," "); 568: else { 569: fetch(ixnew,lowc,c-1,input[1]," "); 570: fetch(ixnew,c,d,input[1],ch == 'c' ? "! " : "+ "); 571: fetch(ixnew,d+1,upd,input[1]," "); 572: } 573: return; 574: } 575: switch (opt) { 576: 577: case D_NORMAL: 578: case D_EDIT: 579: range(a,b,","); 580: putchar(a>b?'a':c>d?'d':'c'); 581: if(opt==D_NORMAL) 582: range(c,d,","); 583: putchar('\n'); 584: break; 585: case D_REVERSE: 586: putchar(a>b?'a':c>d?'d':'c'); 587: range(a,b," "); 588: putchar('\n'); 589: break; 590: } 591: if(opt == D_NORMAL || opt == D_IFDEF) { 592: fetch(ixold,a,b,input[0],"< ", 1); 593: if(a<=b&&c<=d && opt == D_NORMAL) 594: prints("---\n"); 595: } 596: fetch(ixnew,c,d,input[1],opt==D_NORMAL?"> ":"", 0); 597: if ((opt ==D_EDIT || opt == D_REVERSE) && c<=d) 598: prints(".\n"); 599: if (inifdef) { 600: fprintf(stdout, "#endif %s\n", endifname); 601: inifdef = 0; 602: } 603: } 604: 605: range(a,b,separator) 606: char *separator; 607: { 608: printf("%d", a>b?b:a); 609: if(a<b) { 610: printf("%s%d", separator, b); 611: } 612: } 613: 614: fetch(f,a,b,lb,s,oldfile) 615: long *f; 616: FILE *lb; 617: char *s; 618: { 619: register int i, j; 620: register int nc; 621: int oneflag = (*ifdef1!='\0') != (*ifdef2!='\0'); 622: 623: /* 624: * When doing #ifdef's, copy down to current line 625: * if this is the first file, so that stuff makes it to output. 626: */ 627: if (opt == D_IFDEF && oldfile){ 628: long curpos = ftell(lb); 629: /* print through if append (a>b), else to (nb: 0 vs 1 orig) */ 630: nc = f[a>b? b : a-1 ] - curpos; 631: for (i = 0; i < nc; i++) 632: putchar(getc(lb)); 633: } 634: if (a > b) 635: return; 636: if (opt == D_IFDEF) { 637: if (inifdef) 638: fprintf(stdout, "#else %s%s\n", oneflag && oldfile==1 ? "!" : "", ifdef2); 639: else { 640: if (oneflag) { 641: /* There was only one ifdef given */ 642: endifname = ifdef2; 643: if (oldfile) 644: fprintf(stdout, "#ifndef %s\n", endifname); 645: else 646: fprintf(stdout, "#ifdef %s\n", endifname); 647: } 648: else { 649: endifname = oldfile ? ifdef1 : ifdef2; 650: fprintf(stdout, "#ifdef %s\n", endifname); 651: } 652: } 653: inifdef = 1+oldfile; 654: } 655: for(i=a;i<=b;i++) { 656: fseek(lb,f[i-1],0); 657: nc = f[i]-f[i-1]; 658: if (opt != D_IFDEF) 659: prints(s); 660: for(j=0;j<nc;j++) 661: putchar(getc(lb)); 662: } 663: if (inifdef && !wantelses) { 664: fprintf(stdout, "#endif %s\n", endifname); 665: inifdef = 0; 666: } 667: } 668: 669: #define HALFLONG 16 670: #define low(x) (x&((1L<<HALFLONG)-1)) 671: #define high(x) (x>>HALFLONG) 672: 673: /* 674: * hashing has the effect of 675: * arranging line in 7-bit bytes and then 676: * summing 1-s complement in 16-bit hunks 677: */ 678: readhash(f) 679: FILE *f; 680: { 681: long sum; 682: register unsigned shift; 683: register space; 684: register t; 685: sum = 1; 686: space = 0; 687: if(!bflag) for(shift=0;(t=getc(f))!='\n';shift+=7) { 688: if(t==-1) 689: return(0); 690: sum += (long)t << (shift%=HALFLONG); 691: } 692: else for(shift=0;;) { 693: switch(t=getc(f)) { 694: case -1: 695: return(0); 696: case '\t': 697: case ' ': 698: space++; 699: continue; 700: default: 701: if(space) { 702: shift += 7; 703: space = 0; 704: } 705: sum += (long)t << (shift%=HALFLONG); 706: shift += 7; 707: continue; 708: case '\n': 709: break; 710: } 711: break; 712: } 713: sum = low(sum) + high(sum); 714: return((short)low(sum) + (short)high(sum)); 715: }