1: /* diff - differential file comparison 2: * 3: * Uses an algorithm due to Harold Stone, which finds 4: * a pair of longest identical subsequences in the two 5: * files. 6: * 7: * The major goal is to generate the match vector J. 8: * J[i] is the index of the line in file1 corresponding 9: * to line i file0. J[i] = 0 if there is no 10: * such line in file1. 11: * 12: * Lines are hashed so as to work in core. All potential 13: * matches are located by sorting the lines of each file 14: * on the hash (called value_____). In particular, this 15: * collects the equivalence classes in file1 together. 16: * Subroutine equiv____ replaces the value of each line in 17: * file0 by the index of the first element of its 18: * matching equivalence in (the reordered) file1. 19: * To save space equiv_____ squeezes file1 into a single 20: * array member______ in which the equivalence classes 21: * are simply concatenated, except that their first 22: * members are flagged by changing sign. 23: * 24: * Next the indices that point into member______ are unsorted_______ into 25: * array class_____ according to the original order of file0. 26: * 27: * The cleverness lies in routine stone______. This marches 28: * through the lines of file0, developing a vector klist_____ 29: * of "k-candidates". At step i a k-candidate is a matched 30: * pair of lines x,y (x in file0 y in file1) such that 31: * there is a common subsequence of lenght k 32: * between the first i lines of file0 and the first y 33: * lines of file1, but there is no such subsequence for 34: * any smaller y. x is the earliest possible mate to y 35: * that occurs in such a subsequence. 36: * 37: * Whenever any of the members of the equivalence class of 38: * lines in file1 matable to a line in file0 has serial number 39: * less than the y of some k-candidate, that k-candidate 40: * with the smallest such y is replaced. The new 41: * k-candidate is chained (via pred____) to the current 42: * k-1 candidate so that the actual subsequence can 43: * be recovered. When a member has serial number greater 44: * that the y of all k-candidates, the klist is extended. 45: * At the end, the longest subsequence is pulled out 46: * and placed in the array J by unravel_______. 47: * 48: * With J in hand, the matches there recorded are 49: * check_____ed against reality to assure that no spurious 50: * matches have crept in due to hashing. If they have, 51: * they are broken, and "jackpot " is recorded--a harmless 52: * matter except that a true match for a spuriously 53: * mated line may now be unnecessarily reported as a change. 54: * 55: * Much of the complexity of the program comes simply 56: * from trying to minimize core utilization and 57: * maximize the range of doable problems by dynamically 58: * allocating what is needed and reusing what is not. 59: * The core requirements for problems larger than somewhat 60: * are (in words) 2*length(file0) + length(file1) + 61: * 3*(number of k-candidates installed), typically about 62: * 6n words for files of length n. There is also space for buf1 63: * used which could, by moving data underfoot and reallocating 64: * buf1 together with buf2, be completely overlaid. 65: */ 66: struct buf { 67: int fdes; 68: char data[516]; 69: } *buf1, *buf2; 70: 71: struct cand { 72: int x; 73: int y; 74: struct cand *pred; 75: } cand; 76: struct line { 77: int serial; 78: int value; 79: } *file[2], line; 80: int len[2]; 81: int *class; /*will be overlaid on file[0]*/ 82: int *member; /*will be overlaid on file[1]*/ 83: struct cand **klist; /*will be overlaid on file[0] after class*/ 84: int *J; /*will be overlaid on class*/ 85: int *ixold; /*will be overlaid on klist*/ 86: int *ixnew; /*will be overlaid on file[1]*/ 87: 88: char *area; 89: char *top; 90: alloc(n) 91: { 92: register char *p; 93: p = area; 94: n = (n+1) & ~1; 95: area =+ n; 96: while(area > top) { 97: if(sbrk(1024) == -1) { 98: mesg("Out of space\n"); 99: exit(1); 100: } 101: top =+ 1024; 102: } 103: return(p); 104: } 105: 106: mesg(s) 107: char *s; 108: { 109: while(*s) 110: write(2,s++,1); 111: } 112: 113: sort(a,n) /*shellsort CACM #201*/ 114: struct line *a; 115: { 116: struct line w; 117: register int j,m; 118: struct line *ai; 119: register struct line *aim; 120: int k; 121: for(j=1;j<=n;j=* 2) 122: m = 2*j - 1; 123: for(m=/2;m!=0;m=/2) { 124: k = n-m; 125: for(j=1;j<=k;j++) { 126: for(ai = &a[j]; ai > a; ai =- m) { 127: aim = &ai[m]; 128: if(aim->value > ai[0].value || 129: aim->value == ai[0].value && 130: aim->serial > ai[0].serial) 131: break; 132: w.value = ai[0].value; 133: ai[0].value = aim->value; 134: aim->value = w.value; 135: w.serial = ai[0].serial; 136: ai[0].serial = aim->serial; 137: aim->serial = w.serial; 138: } 139: } 140: } 141: } 142: 143: unsort(f, l, b) 144: struct line *f; 145: int *b; 146: { 147: int *a; 148: int i; 149: a = alloc((l+1)*sizeof(a[0])); 150: for(i=1;i<=l;i++) 151: a[f[i].serial] = f[i].value; 152: for(i=1;i<=l;i++) 153: b[i] = a[i]; 154: area = a; 155: } 156: 157: prepare(i, arg) 158: char *arg; 159: { 160: register char *temp; 161: temp = file[i] = area; 162: alloc(sizeof(line)); 163: input(arg); 164: len[i] = (area - temp)/sizeof(line) - 1; 165: alloc(sizeof(line)); 166: sort(file[i], len[i]); 167: } 168: 169: input(arg) 170: { 171: register int h, i; 172: register struct line *p; 173: if(fopen(arg,buf1) == -1) { 174: mesg("Cannot open "); 175: mesg(arg); 176: mesg("\n"); 177: exit(1); 178: } 179: for(i=0; h=readhash(buf1);) { 180: p = alloc(sizeof(line)); 181: p->serial = ++i; 182: p->value = h; 183: } 184: close(buf1->fdes); 185: } 186: 187: equiv(a,n,b,m,c) 188: struct line *a, *b; 189: int *c; 190: { 191: register int i, j; 192: i = j = 1; 193: while(i<=n && j<=m) { 194: if(a[i].value <b[j].value) 195: a[i++].value = 0; 196: else if(a[i].value == b[j].value) 197: a[i++].value = j; 198: else 199: j++; 200: } 201: while(i <= n) 202: a[i++].value = 0; 203: b[m+1].value = 0; 204: j = 0; 205: while(++j <= m) { 206: c[j] = -b[j].serial; 207: while(b[j+1].value == b[j].value) { 208: j++; 209: c[j] = b[j].serial; 210: } 211: } 212: c[j] = -1; 213: } 214: 215: main(argc, argv) 216: char **argv; 217: { 218: int k; 219: 220: if(argc>1 && *argv[1]=='-') { 221: argc--; 222: argv++; 223: } 224: if(argc!=3) { 225: mesg("Arg count\n"); 226: exit(1); 227: } 228: 229: area = top = sbrk(0); 230: buf1 = alloc(sizeof(*buf1)); 231: prepare(0, argv[1]); 232: prepare(1, argv[2]); 233: 234: member = file[1]; 235: equiv(file[0], len[0], file[1], len[1], member); 236: 237: class = file[0]; 238: unsort(file[0], len[0], class); 239: 240: klist = &class[len[0]+2]; 241: area = &member[len[1]+2]; 242: k = stone(class, len[0], member, klist); 243: J = class; 244: unravel(klist[k]); 245: 246: ixold = klist; 247: ixnew = file[1]; 248: area = &ixnew[len[1]+2]; 249: buf2 = alloc(sizeof(*buf2)); 250: if(check(argv)) 251: mesg("Jackpot\n"); 252: output(argv); 253: } 254: 255: stone(a,n,b,c) 256: int *a; 257: int *b; 258: struct cand **c; 259: { 260: register int i, k,y; 261: int j, l; 262: int skip; 263: k = 0; 264: c[0] = 0; 265: for(i=1; i<=n; i++) { 266: j = a[i]; 267: if(j==0) 268: continue; 269: skip = 0; 270: do { 271: y = b[j]; 272: if(y<0) y = -y; 273: if(skip) 274: continue; 275: l = search(c, k, y); 276: if(l > k) { 277: c[k+1] = newcand(i,y,c[k]); 278: skip = 1; 279: k++; 280: } 281: else if(c[l]->y > y && c[l]->x < i) 282: c[l] = newcand(i,y,c[l-1]); 283: } while(b[++j] > 0); 284: } 285: return(k); 286: } 287: 288: struct cand * 289: newcand(x,y,pred) 290: struct cand *pred; 291: { 292: struct cand *p; 293: p = alloc(sizeof(cand)); 294: p->x = x; 295: p->y = y; 296: p->pred = pred; 297: return(p); 298: } 299: 300: search(c, k, y) 301: struct cand **c; 302: { 303: register int i, j, l; 304: int t; 305: i = 0; 306: j = k+1; 307: while((l=(i+j)/2) > i) { 308: t = c[l]->y; 309: if(t > y) 310: j = l; 311: else if(t < y) 312: i = l; 313: else 314: return(l); 315: } 316: return(l+1); 317: } 318: 319: unravel(p) 320: struct cand *p; 321: { 322: int i; 323: for(i=0; i<=len[0]; i++) 324: J[i] = 0; 325: while(p) { 326: J[p->x] = p->y; 327: p = p->pred; 328: } 329: } 330: 331: /* check does double duty: 332: 1. ferret out any fortuitous correspondences due 333: to counfounding by hashing (which result in "jackpot") 334: 2. collect random access indexes to the two files */ 335: 336: check(argv) 337: char **argv; 338: { 339: register int i, j; 340: int ctold, ctnew; 341: int jackpot; 342: char c,d; 343: fopen(argv[1],buf1); 344: fopen(argv[2],buf2); 345: j = 1; 346: ctold = ctnew = 0; 347: ixold[0] = ixnew[0] = 0; 348: jackpot = 0; 349: for(i=1;i<=len[0];i++) { 350: if(J[i]==0) { 351: while(getc(buf1)!='\n') ctold++; 352: ixold[i] = ++ctold; 353: continue; 354: } 355: while(j<J[i]) { 356: while(getc(buf2)!='\n') ctnew++; 357: ixnew[j] = ++ctnew; 358: j++; 359: } 360: while((c=getc(buf1))==(d=getc(buf2))) { 361: if(c=='\n') break; 362: ctold++; 363: ctnew++; 364: } 365: while(c!='\n') { 366: jackpot++; 367: J[i] = 0; 368: c = getc(buf1); 369: ctold++; 370: } 371: ixold[i] = ++ctold; 372: while(d!='\n') { 373: jackpot++; 374: J[i] = 0; 375: d = getc(buf2); 376: ctnew++; 377: } 378: ixnew[j] = ++ctnew; 379: j++; 380: } 381: for(;j<=len[1];j++) { 382: while(getc(buf2)!='\n') ctnew++; 383: ixnew[j] = ++ctnew; 384: } 385: close(buf1->fdes); 386: close(buf2->fdes); 387: return(jackpot); 388: } 389: 390: output(argv) 391: char **argv; 392: { 393: int dir; 394: int m; 395: int i0,i1,j0,j1; 396: extern fout; 397: dir = **argv=='-'; 398: fout = dup(1); 399: buf1->fdes = open(argv[1],0); 400: buf2->fdes = open(argv[2],0); 401: m = len[0]; 402: J[0] = 0; 403: J[m+1] = len[1]+1; 404: if(dir==0) for(i0=1;i0<=m;i0=i1+1) { 405: while(i0<=m&&J[i0]==J[i0-1]+1) i0++; 406: j0 = J[i0-1]+1; 407: i1 = i0-1; 408: while(i1<m&&J[i1+1]==0) i1++; 409: j1 = J[i1+1]-1; 410: J[i1] = j1; 411: change(i0,i1,j0,j1,dir); 412: } else for(i0=m;i0>=1;i0=i1-1) { 413: while(i0>=1&&J[i0]==J[i0+1]-1&&J[i0]!=0) i0--; 414: j0 = J[i0+1]-1; 415: i1 = i0+1; 416: while(i1>1&&J[i1-1]==0) i1--; 417: j1 = J[i1-1]+1; 418: J[i1] = j1; 419: change(i1,i0,j1,j0,dir); 420: } 421: if(m==0) 422: change(1,0,1,len[1],dir); 423: flush(); 424: } 425: 426: change(a,b,c,d,dir) 427: { 428: if(a>b&&c>d) return; 429: range(a,b); 430: putchar(a>b?'a':c>d?'d':'c'); 431: if(dir==0) range(c,d); 432: putchar('\n'); 433: if(dir==0) { 434: fetch(ixold,a,b,buf1,"* "); 435: if(a<=b&&c<=d) printf("---\n"); 436: } 437: fetch(ixnew,c,d,buf2,dir==0?". ":""); 438: if(dir!=0&&c<=d) printf(".\n"); 439: } 440: 441: range(a,b) 442: { 443: if(a>b) printf("%d",b); 444: if(a<=b) printf("%d",a); 445: if(a<b) printf(",%d",b); 446: } 447: 448: fetch(f,a,b,lb,pref) 449: int *f; 450: struct buf *lb; 451: char *pref; 452: { 453: int i, j; 454: int nc; 455: for(i=a;i<=b;i++) { 456: seek(lb->fdes,f[i-1],0); 457: nc = read(lb->fdes,lb->data,f[i]-f[i-1]); 458: printf(pref); 459: for(j=0;j<nc;j++) 460: putchar(lb->data[j]); 461: } 462: }