1: /* Prepare Tex index dribble output into an actual index. 2: Copyright (C) Richard M. Stallman 1984 3: 4: Permission is granted to anyone to make or distribute 5: verbatim copies of this program 6: provided that the copyright notice and this permission notice are preserved; 7: and provided that the recipient is not asked to waive or limit his right to 8: redistribute copies as permitted by this permission notice; 9: and provided that anyone possessing a machine-executable copy 10: is granted access to copy the source code, in machine-readable form, 11: in some reasonable manner. 12: 13: Permission is granted to distribute derived works or enhanced versions of 14: this program under the above conditions with the additional condition 15: that the entire derivative or enhanced work 16: must be covered by a permission notice identical to this one. 17: 18: Anything distributed as part of a package containing portions derived 19: from this program, which cannot in current practice perform its function 20: usefully in the absence of what was derived directly from this program, 21: is to be considered as forming, together with the latter, 22: a single work derived from this program, 23: which must be entirely covered by a permission notice identical to this one 24: in order for distribution of the package to be permitted. 25: 26: In other words, you are welcome to use, share and improve this program. 27: You are forbidden to forbid anyone else to use, share and improve 28: what you give them. Help stamp out software-hoarding! */ 29: 30: #include <stdio.h> 31: #include <sys/file.h> 32: 33: #ifndef L_XTND 34: #define L_XTND 2 35: #endif 36: 37: /* When sorting in core, this structure describes one line 38: and the position and length of its first keyfield. */ 39: 40: struct lineinfo 41: { 42: char *text; /* The actual text of the line */ 43: union 44: { /* The start of the key (for textual comparison) */ 45: char *text; 46: long number; /* or the numeric value (for numeric comparison) */ 47: } key; 48: long keylen; /* Length of key field */ 49: }; 50: 51: /* This structure describes a field to use as a sort key */ 52: 53: struct keyfield 54: { 55: int startwords; /* # words to skip */ 56: int startchars; /* and # additional chars to skip, to start of field */ 57: int endwords; /* similar, from beg (or end) of line, to find end of field */ 58: int endchars; 59: char ignore_blanks; /* Ignore spaces and tabs within the field */ 60: char fold_case; /* Convert upper case to lower before comparing */ 61: char reverse; /* Compare in reverse order */ 62: char numeric; /* Parse text as an integer and compare the integers */ 63: char positional; /* Sort according to position within the file */ 64: char braced; /* Count balanced-braced groupings as fields */ 65: }; 66: 67: /* Vector of keyfields to use */ 68: 69: struct keyfield keyfields[3]; 70: 71: /* Number of keyfields stored in that vector. */ 72: 73: int num_keyfields = 3; 74: 75: /* Vector of input file names, terminated with a zero (null pointer) */ 76: 77: char **infiles; 78: 79: /* Vector of corresponding output file names, or zero meaning default it */ 80: 81: char **outfiles; 82: 83: /* Length of `infiles' */ 84: 85: int num_infiles; 86: 87: /* Pointer to the array of pointers to lines being sorted */ 88: 89: char **linearray; 90: 91: /* The allocated length of `linearray'. */ 92: 93: long lines; 94: 95: /* Directory to use for temporary files */ 96: 97: char *tempdir; 98: 99: /* Start of filename to use for temporary files. It starts with a slash. */ 100: 101: char *tempbase; 102: 103: /* Number of last temporary file. */ 104: 105: int tempcount; 106: 107: /* Number of last temporary file already deleted. 108: Temporary files are deleted by `flush_tempfiles' in order of creation. */ 109: 110: int last_deleted_tempcount; 111: 112: /* During in-core sort, this points to the base of the data block 113: which contains all the lines of data. */ 114: 115: char *text_base; 116: 117: /* Additional command switches */ 118: 119: int keep_tempfiles; /* Nonzero means do not delete tempfiles -- for debugging */ 120: 121: /* Forward declarations of functions in this file */ 122: 123: void decode_command (); 124: void sort_in_core (); 125: void sort_offline (); 126: char **parsefile (); 127: char *find_field (); 128: char *find_pos (); 129: char *find_braced_pos (); 130: char *find_braced_end (); 131: void writelines (); 132: int compare_full (); 133: long readline (); 134: int merge_files (); 135: int merge_direct (); 136: char *concat (); 137: char *maketempname (); 138: void flush_tempfiles (); 139: char *tempcopy (); 140: 141: extern char *mktemp (); 142: 143: #define MAX_IN_CORE_SORT 500000 144: 145: int 146: main (argc, argv) 147: int argc; 148: char **argv; 149: { 150: int i; 151: 152: tempcount = 0; 153: last_deleted_tempcount = 0; 154: 155: /* Describe the kind of sorting to do. */ 156: /* The first keyfield uses the first braced field and folds case */ 157: keyfields[0].braced = 1; 158: keyfields[0].fold_case = 1; 159: keyfields[0].endwords = -1; 160: keyfields[0].endchars = -1; 161: /* The second keyfield uses the second braced field, numerically */ 162: keyfields[1].braced = 1; 163: keyfields[1].numeric = 1; 164: keyfields[1].startwords = 1; 165: keyfields[1].endwords = -1; 166: keyfields[1].endchars = -1; 167: /* The third keyfield (which is ignored while discarding duplicates) 168: compares the whole line */ 169: keyfields[2].endwords = -1; 170: keyfields[2].endchars = -1; 171: 172: decode_command (argc, argv); 173: 174: tempbase = mktemp (concat ("/txiXXXXXX", "", "")); 175: 176: /* Process input files completely, one by one. */ 177: 178: for (i = 0; i < num_infiles; i++) 179: { 180: int desc; 181: long ptr; 182: char *outfile; 183: char *p; 184: 185: desc = open (infiles[i], 0, 0); 186: if (desc < 0) pfatal_with_name (infiles[i]); 187: lseek (desc, 0, L_XTND); 188: ptr = tell (desc); 189: close (desc); 190: 191: outfile = outfiles[i]; 192: if (!outfile) 193: { 194: outfile = concat (infiles[i], "s", ""); 195: } 196: 197: if (ptr < MAX_IN_CORE_SORT) 198: /* Sort a small amount of data */ 199: sort_in_core (infiles[i], ptr, outfile); 200: else 201: sort_offline (infiles[i], ptr, outfile); 202: } 203: 204: flush_tempfiles (tempcount); 205: exit (0); 206: } 207: 208: /* This page decodes the command line arguments to set the parameter variables 209: and set up the vector of keyfields and the vector of input files */ 210: 211: void 212: decode_command (argc, argv) 213: int argc; 214: char **argv; 215: { 216: int i; 217: char **ip; 218: char **op; 219: 220: /* Store default values into parameter variables */ 221: 222: tempdir = "/tmp"; 223: keep_tempfiles = 0; 224: 225: /* Allocate argc input files, which must be enough. */ 226: 227: infiles = (char **) xmalloc (argc * sizeof (char *)); 228: outfiles = (char **) xmalloc (argc * sizeof (char *)); 229: ip = infiles; 230: op = outfiles; 231: 232: /* First find all switches that control the default kind-of-sort */ 233: 234: for (i = 1; i < argc; i++) 235: { 236: int tem = classify_arg (argv[i]); 237: char c; 238: char *p; 239: 240: if (tem <= 0) 241: { 242: *ip++ = argv[i]; 243: *op++ = 0; 244: continue; 245: } 246: if (tem > 1) 247: { 248: if (i + 1 == argc) 249: fatal ("switch %s given with no argument following it", argv[i]); 250: else if (!strcmp (argv[i], "-T")) 251: tempdir = argv[i + 1]; 252: else if (!strcmp (argv[i], "-o")) 253: *(op - 1) = argv[i + 1]; 254: i += tem - 1; 255: continue; 256: } 257: 258: p = &argv[i][1]; 259: while (c = *p++) 260: switch (c) 261: { 262: case 'k': 263: keep_tempfiles = 1; 264: break; 265: 266: default: 267: fatal ("invalid command switch %c", c); 268: } 269: switchdone: ; 270: } 271: 272: /* Record number of keyfields, terminate list of filenames */ 273: 274: num_infiles = ip - infiles; 275: *ip = 0; 276: } 277: 278: /* Return 0 for an argument that is not a switch; 279: for a switch, return 1 plus the number of following arguments that the switch swallows. 280: */ 281: 282: int 283: classify_arg (arg) 284: char *arg; 285: { 286: if (!strcmp (arg, "-T") || !strcmp (arg, "-o")) 287: return 2; 288: if (arg[0] == '-') 289: return 1; 290: return 0; 291: } 292: 293: /* Create a name for a temporary file */ 294: 295: char * 296: maketempname (count) 297: int count; 298: { 299: char tempsuffix[10]; 300: sprintf (tempsuffix, "%d", count); 301: return concat (tempdir, tempbase, tempsuffix); 302: } 303: 304: /* Delete all temporary files up to the specified count */ 305: 306: void 307: flush_tempfiles (to_count) 308: int to_count; 309: { 310: if (keep_tempfiles) return; 311: while (last_deleted_tempcount < to_count) 312: unlink (maketempname (++last_deleted_tempcount)); 313: } 314: 315: /* Copy an input file into a temporary file, and return the temporary file name */ 316: 317: #define BUFSIZE 1024 318: 319: char * 320: tempcopy (idesc) 321: int idesc; 322: { 323: char *outfile = maketempname (++tempcount); 324: int odesc; 325: char buffer[BUFSIZE]; 326: 327: odesc = open (outfile, O_WRONLY | O_CREAT, 0666); 328: 329: if (odesc < 0) pfatal_with_name (outfile); 330: 331: while (1) 332: { 333: int nread = read (idesc, buffer, BUFSIZE); 334: write (odesc, buffer, nread); 335: if (!nread) break; 336: } 337: 338: close (odesc); 339: 340: return outfile; 341: } 342: 343: /* Compare two lines, provided as pointers to pointers to text, 344: according to the specified set of keyfields */ 345: 346: int 347: compare_full (line1, line2) 348: char **line1, **line2; 349: { 350: int i; 351: 352: /* Compare using the first keyfield; 353: if that does not distinguish the lines, try the second keyfield; and so on. */ 354: 355: for (i = 0; i < num_keyfields; i++) 356: { 357: long length1, length2; 358: char *start1 = find_field (&keyfields[i], *line1, &length1); 359: char *start2 = find_field (&keyfields[i], *line2, &length2); 360: int tem = compare_field (&keyfields[i], start1, length1, *line1 - text_base, 361: start2, length2, *line2 - text_base); 362: if (tem) 363: { 364: if (keyfields[i].reverse) 365: return - tem; 366: return tem; 367: } 368: } 369: 370: return 0; /* Lines match exactly */ 371: } 372: 373: /* Compare two lines described by structures 374: in which the first keyfield is identified in advance. 375: For positional sorting, assumes that the order of the lines in core 376: reflects their nominal order. */ 377: 378: int 379: compare_prepared (line1, line2) 380: struct lineinfo *line1, *line2; 381: { 382: int i; 383: int tem; 384: char *text1, *text2; 385: 386: /* Compare using the first keyfield, which has been found for us already */ 387: if (keyfields->positional) 388: { 389: if (line1->text - text_base > line2->text - text_base) 390: tem = 1; 391: else 392: tem = -1; 393: } 394: else if (keyfields->numeric) 395: tem = line1->key.number - line2->key.number; 396: else 397: tem = compare_field (keyfields, line1->key.text, line1->keylen, 0, line2->key, line2->keylen, 0); 398: if (tem) 399: { 400: if (keyfields->reverse) 401: return - tem; 402: return tem; 403: } 404: 405: text1 = line1->text; 406: text2 = line2->text; 407: 408: /* Compare using the second keyfield; 409: if that does not distinguish the lines, try the third keyfield; and so on. */ 410: 411: for (i = 1; i < num_keyfields; i++) 412: { 413: long length1, length2; 414: char *start1 = find_field (&keyfields[i], text1, &length1); 415: char *start2 = find_field (&keyfields[i], text2, &length2); 416: int tem = compare_field (&keyfields[i], start1, length1, text1 - text_base, 417: start2, length2, text2 - text_base); 418: if (tem) 419: { 420: if (keyfields[i].reverse) 421: return - tem; 422: return tem; 423: } 424: } 425: 426: return 0; /* Lines match exactly */ 427: } 428: 429: /* Like compare_full but more general. 430: You can pass any strings, and you can say how many keyfields to use. 431: `pos1' and `pos2' should indicate the nominal positional ordering of 432: the two lines in the input. */ 433: 434: int 435: compare_general (str1, str2, pos1, pos2, use_keyfields) 436: char *str1, *str2; 437: long pos1, pos2; 438: int use_keyfields; 439: { 440: int i; 441: 442: /* Compare using the first keyfield; 443: if that does not distinguish the lines, try the second keyfield; and so on. */ 444: 445: for (i = 0; i < use_keyfields; i++) 446: { 447: long length1, length2; 448: char *start1 = find_field (&keyfields[i], str1, &length1); 449: char *start2 = find_field (&keyfields[i], str2, &length2); 450: int tem = compare_field (&keyfields[i], start1, length1, pos1, start2, length2, pos2); 451: if (tem) 452: { 453: if (keyfields[i].reverse) 454: return - tem; 455: return tem; 456: } 457: } 458: 459: return 0; /* Lines match exactly */ 460: } 461: 462: /* Find the start and length of a field in `str' according to `keyfield'. 463: A pointer to the starting character is returned, and the length 464: is stored into the int that `lengthptr' points to. */ 465: 466: char * 467: find_field (keyfield, str, lengthptr) 468: struct keyfield *keyfield; 469: char *str; 470: long *lengthptr; 471: { 472: char *start; 473: char *end; 474: char *(*fun) (); 475: 476: if (keyfield->braced) fun = find_braced_pos; 477: else fun = find_pos; 478: 479: start = ( *fun )(str, keyfield->startwords, keyfield->startchars, 480: keyfield->ignore_blanks); 481: if (keyfield->endwords < 0) 482: { 483: if (keyfield->braced) 484: end = find_braced_end (start); 485: else 486: { 487: end = start; 488: while (*end && *end != '\n') end++; 489: } 490: } 491: else 492: { 493: end = ( *fun )(str, keyfield->endwords, keyfield->endchars, 0); 494: if (end - str < start - str) end = start; 495: } 496: *lengthptr = end - start; 497: return start; 498: } 499: 500: /* Find a pointer to a specified place within `str', 501: skipping (from the beginning) `words' words and then `chars' chars. 502: If `ignore_blanks' is nonzero, we skip all blanks 503: after finding the specified word. */ 504: 505: char * 506: find_pos (str, words, chars, ignore_blanks) 507: char *str; 508: int words, chars; 509: int ignore_blanks; 510: { 511: int i; 512: char *p = str; 513: 514: for (i = 0; i < words; i++) 515: { 516: char c; 517: /* Find next bunch of nonblanks and skip them. */ 518: while ((c = *p) == ' ' || c == '\t') p++; 519: while ((c = *p) && c != '\n' && !(c == ' ' || c == '\t')) p++; 520: if (!*p || *p == '\n') return p; 521: } 522: 523: while (*p == ' ' || *p == '\t') p++; 524: 525: for (i = 0; i < chars; i++) 526: { 527: if (!*p || *p == '\n') break; 528: p++; 529: } 530: return p; 531: } 532: 533: /* Like find_pos but assumes that each field is surrounded by braces 534: and that braces within fields are balanced. */ 535: 536: char * 537: find_braced_pos (str, words, chars, ignore_blanks) 538: char *str; 539: int words, chars; 540: int ignore_blanks; 541: { 542: int i; 543: int bracelevel; 544: char *p = str; 545: char c; 546: 547: for (i = 0; i < words; i++) 548: { 549: bracelevel = 1; 550: while ((c = *p++) != '{' && c != '\n' && c); 551: if (c != '{') 552: return p - 1; 553: while (bracelevel) 554: { 555: c = *p++; 556: if (c == '{') bracelevel++; 557: if (c == '}') bracelevel--; 558: if (c == '\\') c = *p++; /* \ quotes braces and \ */ 559: if (c == 0 || c == '\n') return p-1; 560: } 561: } 562: 563: while ((c = *p++) != '{' && c != '\n' && c); 564: 565: if (c != '{') 566: return p-1; 567: 568: if (ignore_blanks) 569: while ((c = *p) == ' ' || c == '\t') p++; 570: 571: for (i = 0; i < chars; i++) 572: { 573: if (!*p || *p == '\n') break; 574: p++; 575: } 576: return p; 577: } 578: 579: /* Find the end of the balanced-brace field which starts at `str'. 580: The position returned is just before the closing brace. */ 581: 582: char * 583: find_braced_end (str) 584: char *str; 585: { 586: int bracelevel; 587: char *p = str; 588: char c; 589: 590: bracelevel = 1; 591: while (bracelevel) 592: { 593: c = *p++; 594: if (c == '{') bracelevel++; 595: if (c == '}') bracelevel--; 596: if (c == '\\') c = *p++; 597: if (c == 0 || c == '\n') return p-1; 598: } 599: return p - 1; 600: } 601: 602: /* Compare two fields (each specified as a start pointer and a character count) 603: according to `keyfield'. The sign of the value reports the relation between the fields */ 604: 605: int 606: compare_field (keyfield, start1, length1, pos1, start2, length2, pos2) 607: struct keyfield *keyfield; 608: char *start1; 609: long length1; 610: long pos1; 611: char *start2; 612: long length2; 613: long pos2; 614: { 615: if (keyfields->positional) 616: { 617: if (pos1 > pos2) 618: return 1; 619: else 620: return -1; 621: } 622: if (keyfield->numeric) 623: { 624: long value = atol (start1) - atol (start2); 625: if (value > 0) return 1; 626: if (value < 0) return -1; 627: return 0; 628: } 629: else 630: { 631: char *p1 = start1; 632: char *p2 = start2; 633: char *e1 = start1 + length1; 634: char *e2 = start2 + length2; 635: 636: int fold_case = keyfield->fold_case; 637: 638: while (1) 639: { 640: char c1, c2; 641: 642: if (p1 == e1) c1 = 0; 643: else c1 = *p1++; 644: if (p2 == e2) c2 = 0; 645: else c2 = *p2++; 646: 647: /* Ignore case of both if desired */ 648: 649: if (fold_case) 650: { 651: if (c1 >= 'A' && c1 <= 'Z') c1 = c1 + 040; 652: if (c2 >= 'A' && c2 <= 'Z') c2 = c2 + 040; 653: } 654: 655: /* Actually compare */ 656: 657: if (c1 != c2) return c1 - c2; 658: if (!c1) break; 659: } 660: return 0; 661: } 662: } 663: 664: /* A `struct linebuffer' is a structure which holds a line of text. 665: `readline' reads a line from a stream into a linebuffer 666: and works regardless of the length of the line. */ 667: 668: struct linebuffer 669: { 670: long size; 671: char *buffer; 672: }; 673: 674: /* Initialize a linebuffer for use */ 675: 676: void 677: initbuffer (linebuffer) 678: struct linebuffer *linebuffer; 679: { 680: linebuffer->size = 200; 681: linebuffer->buffer = (char *) xmalloc (200); 682: } 683: 684: /* Read a line of text from `stream' into `linebuffer'. 685: Return the length of the line. */ 686: 687: long 688: readline (linebuffer, stream) 689: struct linebuffer *linebuffer; 690: FILE *stream; 691: { 692: char *buffer = linebuffer->buffer; 693: char *p = linebuffer->buffer; 694: char *end = p + linebuffer->size; 695: 696: while (1) 697: { 698: int c = getc (stream); 699: if (p == end) 700: { 701: buffer = (char *) xrealloc (buffer, linebuffer->size *= 2); 702: p += buffer - linebuffer->buffer; 703: end += buffer - linebuffer->buffer; 704: linebuffer->buffer = buffer; 705: } 706: if (c < 0 || c == '\n') 707: { 708: *p = 0; 709: break; 710: } 711: *p++ = c; 712: } 713: 714: return p - buffer; 715: } 716: 717: /* Sort the input files together when they are too big to sort in core */ 718: 719: void 720: sort_offline (infile, nfiles, total, outfile) 721: char *infile; 722: long total; 723: char *outfile; 724: { 725: int ntemps = 2 * (total + MAX_IN_CORE_SORT - 1) / MAX_IN_CORE_SORT; /* More than enough */ 726: char **tempfiles = (char **) xmalloc (ntemps * sizeof (char *)); 727: FILE *istream = fopen (infile, "r"); 728: int i; 729: struct linebuffer lb; 730: long linelength; 731: 732: initbuffer (&lb); 733: 734: /* Read in one line of input data. */ 735: 736: linelength = readline (&lb, istream); 737: 738: /* Split up the input into `ntemps' temporary files, or maybe fewer, 739: and put the new files' names into `tempfiles' */ 740: 741: for (i = 0; i < ntemps; i++) 742: { 743: char *outname = maketempname (++tempcount); 744: FILE *ostream = fopen (outname, "w"); 745: long tempsize = 0; 746: 747: if (!ostream) pfatal_with_name (outname); 748: tempfiles[i] = outname; 749: 750: /* Copy lines into this temp file as long as it does not make file "too big" 751: or until there are no more lines. */ 752: 753: while (tempsize + linelength + 1 <= MAX_IN_CORE_SORT) 754: { 755: tempsize += linelength + 1; 756: fputs (lb.buffer, ostream); 757: putc ('\n', ostream); 758: 759: /* Read another line of input data. */ 760: 761: linelength = readline (&lb, istream); 762: if (!linelength && feof (istream)) break; 763: } 764: fclose (ostream); 765: if (feof (istream)) break; 766: } 767: 768: free (lb.buffer); 769: 770: /* Record number of temp files we actually needed. */ 771: 772: ntemps = i; 773: 774: /* Sort each tempfile into another tempfile. 775: Delete the first set of tempfiles and put the names of the second into `tempfiles' */ 776: 777: for (i = 0; i < ntemps; i++) 778: { 779: char *newtemp = maketempname (++tempcount); 780: sort_in_core (&tempfiles[i], 1, MAX_IN_CORE_SORT, newtemp); 781: if (!keep_tempfiles) 782: unlink (tempfiles[i]); 783: tempfiles[i] = newtemp; 784: } 785: 786: /* Merge the tempfiles together and indexify */ 787: 788: merge_files (tempfiles, ntemps, outfile); 789: } 790: 791: /* Sort `infile', whose size is `total', 792: assuming that is small enough to be done in-core, 793: then indexify it and send the output to `outfile' (or to stdout). */ 794: 795: void 796: sort_in_core (infile, total, outfile) 797: char *infile; 798: long total; 799: char *outfile; 800: { 801: char **nextline; 802: char *data = (char *) xmalloc (total + 1); 803: char *file_data; 804: long file_size; 805: int i; 806: FILE *ostream = stdout; 807: struct lineinfo *lineinfo; 808: 809: /* Read the contents of the file into the moby array `data' */ 810: 811: int desc = open (infile, 0, 0); 812: 813: if (desc < 0) 814: fatal ("failure reopening %s", infile); 815: file_size = read (desc, data, total); 816: file_data = data; 817: data[file_size] = 0; 818: 819: close (desc); 820: 821: /* Sort routines want to know this address */ 822: 823: text_base = data; 824: 825: /* Create the array of pointers to lines, with a default size frequently enough. */ 826: 827: lines = total / 50; 828: if (!lines) lines = 2; 829: linearray = (char **) xmalloc (lines * sizeof (char *)); 830: 831: /* `nextline' points to the next free slot in this array. 832: `lines' is the allocated size. */ 833: 834: nextline = linearray; 835: 836: /* Parse the input file's data, and make entries for the lines. */ 837: 838: nextline = parsefile (infile, nextline, file_data, file_size); 839: 840: /* Sort the lines */ 841: 842: /* If we have enough space, find the first keyfield of each line in advance. 843: Make a `struct lineinfo' for each line, which records the keyfield 844: as well as the line, and sort them. */ 845: 846: lineinfo = (struct lineinfo *) malloc ((nextline - linearray) * sizeof (struct lineinfo)); 847: 848: if (lineinfo) 849: { 850: struct lineinfo *lp; 851: char **p; 852: 853: for (lp = lineinfo, p = linearray; p != nextline; lp++, p++) 854: { 855: lp->text = *p; 856: lp->key.text = find_field (keyfields, *p, &lp->keylen); 857: if (keyfields->numeric) 858: lp->key.number = atol (lp->key.text); 859: } 860: 861: qsort (lineinfo, nextline - linearray, sizeof (struct lineinfo), compare_prepared); 862: 863: for (lp = lineinfo, p = linearray; p != nextline; lp++, p++) 864: *p = lp->text; 865: 866: free (lineinfo); 867: } 868: else 869: qsort (linearray, nextline - linearray, sizeof (char *), compare_full); 870: 871: /* Open the output file */ 872: 873: if (outfile) 874: { 875: ostream = fopen (outfile, "w"); 876: if (!ostream) 877: pfatal_with_name (outfile); 878: } 879: 880: writelines (linearray, nextline - linearray, ostream); 881: if (outfile) fclose (ostream); 882: 883: free (linearray); 884: free (data); 885: } 886: 887: /* Parse an input string in core into lines. 888: `data' is the input string, and `size' is its length. 889: Data goes in `linearray' starting at `nextline'. 890: The value returned is the first entry in `linearray' still unused. */ 891: 892: char ** 893: parsefile (filename, nextline, data, size) 894: char *filename; 895: char **nextline; 896: char *data; 897: long size; 898: { 899: char *p, *end; 900: char **line = nextline; 901: 902: p = data; 903: end = p + size; 904: *end = 0; 905: 906: while (p != end) 907: { 908: *line = p; 909: while (*p && *p != '\n') p++; 910: if (p != end) p++; 911: 912: /* This feature will be installed later. */ 913: /* if (discard_empty_lines && p == *line + 1) continue; */ 914: 915: line++; 916: if (line == linearray + lines) 917: { 918: char **old = linearray; 919: linearray = (char **) xrealloc (linearray, sizeof (char *) * (lines *= 4)); 920: line += linearray - old; 921: } 922: } 923: 924: return line; 925: } 926: 927: /* Indexification is a filter applied to the sorted lines 928: as they are being written to the output file. 929: Multiple entries for the same name, with different page numbers, 930: get combined into a single entry with multiple page numbers. 931: The first braced field, which is used for sorting, is discarded. 932: However, its first character is examined, folded to lower case, 933: and if it is different from that in the previous line fed to us 934: a \initial line is written with one argument, the new initial. 935: 936: If an entry has four braced fields, then the second and third 937: constitute primary and secondary names. 938: In this case, each change of primary name 939: generates a \primary line which contains only the primary name, 940: and in between these are \secondary lines which contain 941: just a secondary name and page numbers. 942: */ 943: 944: /* The last primary name we wrote a \primary entry for. 945: If only one level of indexing is being done, this is the last name seen */ 946: char *lastprimary; 947: int lastprimarylength; /* Length of storage allocated for lastprimary */ 948: 949: /* Similar, for the secondary name. */ 950: char *lastsecondary; 951: int lastsecondarylength; 952: 953: /* Zero if we are not in the middle of writing an entry. 954: One if we have written the beginning of an entry but have not 955: yet written any page numbers into it. 956: Greater than one if we have written the beginning of an entry 957: plus at least one page number. */ 958: int pending; 959: 960: /* The initial (for sorting purposes) of the last primary entry written. 961: When this changes, a \initial {c} line is written */ 962: 963: char * lastinitial; 964: 965: int lastinitiallength; 966: 967: /* When we need a string of length 1 for the value of lastinitial, 968: store it here. */ 969: 970: char lastinitial1[2]; 971: 972: /* Initialize static storage for writing an index */ 973: 974: void 975: init_index () 976: { 977: pending = 0; 978: lastinitial = lastinitial1; 979: lastinitial1[0] = 0; 980: lastinitial1[1] = 0; 981: lastinitiallength = 0; 982: lastprimarylength = 100; 983: lastprimary = (char *) xmalloc (lastprimarylength + 1); 984: bzero (lastprimary, lastprimarylength + 1); 985: lastsecondarylength = 100; 986: lastsecondary = (char *) xmalloc (lastsecondarylength + 1); 987: bzero (lastsecondary, lastsecondarylength + 1); 988: } 989: 990: /* Indexify. Merge entries for the same name, 991: insert headers for each initial character, etc. */ 992: 993: indexify (line, ostream) 994: char *line; 995: FILE *ostream; 996: { 997: char *primary, *secondary, *pagenumber; 998: int primarylength, secondarylength, pagelength; 999: int len = strlen (line); 1000: int nosecondary; 1001: int initiallength; 1002: char *initial; 1003: char initial1[2]; 1004: register char *p; 1005: 1006: /* First, analyze the parts of the entry fed to us this time */ 1007: 1008: p = find_braced_pos (line, 0, 0, 0); 1009: if (*p == '{') 1010: { 1011: initial = p; 1012: /* Get length of inner pair of braces starting at p, 1013: including that inner pair of braces. */ 1014: initiallength = find_braced_end (p + 1) + 1 - p; 1015: } 1016: else 1017: { 1018: initial = initial1; 1019: initial1[0] = *p; 1020: initial1[1] = 0; 1021: initiallength = 1; 1022: 1023: if (initial1[0] >= 'a' && initial1[0] <= 'z') 1024: initial1[0] -= 040; 1025: } 1026: 1027: pagenumber = find_braced_pos (line, 1, 0, 0); 1028: pagelength = find_braced_end (pagenumber) - pagenumber; 1029: if (pagelength == 0) 1030: abort (); 1031: 1032: primary = find_braced_pos (line, 2, 0, 0); 1033: primarylength = find_braced_end (primary) - primary; 1034: 1035: secondary = find_braced_pos (line, 3, 0, 0); 1036: nosecondary = !*secondary; 1037: if (!nosecondary) 1038: secondarylength = find_braced_end (secondary) - secondary; 1039: 1040: /* If the primary is different from before, make a new primary entry */ 1041: if (strncmp (primary, lastprimary, primarylength)) 1042: { 1043: /* Close off current secondary entry first, if one is open */ 1044: if (pending) 1045: { 1046: fputs ("}\n", ostream); 1047: pending = 0; 1048: } 1049: 1050: /* If this primary has a different initial, include an entry for the initial */ 1051: if (initiallength != lastinitiallength || strcmp (initial, lastinitial)) 1052: { 1053: fprintf (ostream, "\\initial {"); 1054: fwrite (initial, 1, initiallength, ostream); 1055: fprintf (ostream, "}\n", initial); 1056: if (initial == initial1) 1057: { 1058: lastinitial = lastinitial1; 1059: *lastinitial1 = *initial1; 1060: } 1061: else 1062: { 1063: lastinitial = initial; 1064: } 1065: lastinitiallength = initiallength; 1066: } 1067: 1068: /* Make the entry for the primary. */ 1069: if (nosecondary) 1070: fputs ("\\entry {", ostream); 1071: else 1072: fputs ("\\primary {", ostream); 1073: fwrite (primary, primarylength, 1, ostream); 1074: if (nosecondary) 1075: { 1076: fputs ("}{", ostream); 1077: pending = 1; 1078: } 1079: else 1080: fputs ("}\n", ostream); 1081: 1082: /* Record name of most recent primary */ 1083: if (lastprimarylength < primarylength) 1084: { 1085: lastprimarylength = primarylength + 100; 1086: lastprimary = (char *) xrealloc (lastprimary, 1087: 1 + lastprimarylength); 1088: } 1089: strncpy (lastprimary, primary, primarylength); 1090: lastprimary[primarylength] = 0; 1091: 1092: /* There is no current secondary within this primary, now */ 1093: lastsecondary[0] = 0; 1094: } 1095: 1096: /* Should not have an entry with no subtopic following one with a subtopic */ 1097: 1098: if (nosecondary && *lastsecondary) 1099: error ("entry %s follows an entry with a secondary name", line); 1100: 1101: /* Start a new secondary entry if necessary */ 1102: if (!nosecondary && strncmp (secondary, lastsecondary, secondarylength)) 1103: { 1104: if (pending) 1105: { 1106: fputs ("}\n", ostream); 1107: pending = 0; 1108: } 1109: 1110: /* Write the entry for the secondary. */ 1111: fputs ("\\secondary {", ostream); 1112: fwrite (secondary, secondarylength, 1, ostream); 1113: fputs ("}{", ostream); 1114: pending = 1; 1115: 1116: /* Record name of most recent secondary */ 1117: if (lastsecondarylength < secondarylength) 1118: { 1119: lastsecondarylength = secondarylength + 100; 1120: lastsecondary = (char *) xrealloc (lastsecondary, 1121: 1 + lastsecondarylength); 1122: } 1123: strncpy (lastsecondary, secondary, secondarylength); 1124: lastsecondary[secondarylength] = 0; 1125: } 1126: 1127: /* Here to add one more page number to the current entry */ 1128: if (pending++ != 1) 1129: fputs (", ", ostream); /* Punctuate first, if this is not the first */ 1130: fwrite (pagenumber, pagelength, 1, ostream); 1131: } 1132: 1133: /* Close out any unfinished output entry */ 1134: 1135: void 1136: finish_index (ostream) 1137: FILE *ostream; 1138: { 1139: if (pending) 1140: fputs ("}\n", ostream); 1141: free (lastprimary); 1142: free (lastsecondary); 1143: } 1144: 1145: /* Copy the lines in the sorted order. 1146: Each line is copied out of the input file it was found in. */ 1147: 1148: void 1149: writelines (linearray, nlines, ostream) 1150: char **linearray; 1151: int nlines; 1152: FILE *ostream; 1153: { 1154: char **stop_line = linearray + nlines; 1155: char **next_line; 1156: 1157: init_index (); 1158: 1159: /* Output the text of the lines, and free the buffer space */ 1160: 1161: for (next_line = linearray; next_line != stop_line; next_line++) 1162: { 1163: /* If -u was specified, output the line only if distinct from previous one. */ 1164: if (next_line == linearray 1165: /* Compare previous line with this one, using only the explicitly specd keyfields */ 1166: || compare_general (*(next_line - 1), *next_line, 0L, 0L, num_keyfields - 1)) 1167: { 1168: char *p = *next_line; 1169: char c; 1170: while ((c = *p++) && c != '\n'); 1171: *(p-1) = 0; 1172: indexify (*next_line, ostream); 1173: } 1174: } 1175: 1176: finish_index (ostream); 1177: } 1178: 1179: /* Assume (and optionally verify) that each input file is sorted; 1180: merge them and output the result. 1181: Returns nonzero if any input file fails to be sorted. 1182: 1183: This is the high-level interface that can handle an unlimited number of files. */ 1184: 1185: #define MAX_DIRECT_MERGE 10 1186: 1187: int 1188: merge_files (infiles, nfiles, outfile) 1189: char **infiles; 1190: int nfiles; 1191: char *outfile; 1192: { 1193: char **tempfiles; 1194: int ntemps; 1195: int i; 1196: int value = 0; 1197: int start_tempcount = tempcount; 1198: 1199: if (nfiles <= MAX_DIRECT_MERGE) 1200: return merge_direct (infiles, nfiles, outfile); 1201: 1202: /* Merge groups of MAX_DIRECT_MERGE input files at a time, 1203: making a temporary file to hold each group's result. */ 1204: 1205: ntemps = (nfiles + MAX_DIRECT_MERGE - 1) / MAX_DIRECT_MERGE; 1206: tempfiles = (char **) xmalloc (ntemps * sizeof (char *)); 1207: for (i = 0; i < ntemps; i++) 1208: { 1209: int nf = MAX_DIRECT_MERGE; 1210: if (i + 1 == ntemps) 1211: nf = nfiles - i * MAX_DIRECT_MERGE; 1212: tempfiles[i] = maketempname (++tempcount); 1213: value |= merge_direct (&infiles[i * MAX_DIRECT_MERGE], nf, tempfiles[i]); 1214: } 1215: 1216: /* All temporary files that existed before are no longer needed 1217: since their contents have been merged into our new tempfiles. 1218: So delete them. */ 1219: flush_tempfiles (start_tempcount); 1220: 1221: /* Now merge the temporary files we created. */ 1222: 1223: merge_files (tempfiles, ntemps, outfile); 1224: 1225: free (tempfiles); 1226: 1227: return value; 1228: } 1229: 1230: /* Assume (and optionally verify) that each input file is sorted; 1231: merge them and output the result. 1232: Returns nonzero if any input file fails to be sorted. 1233: 1234: This version of merging will not work if the number of 1235: input files gets too high. Higher level functions 1236: use it only with a bounded number of input files. */ 1237: 1238: int 1239: merge_direct (infiles, nfiles, outfile) 1240: char **infiles; 1241: int nfiles; 1242: char *outfile; 1243: { 1244: char **ip = infiles; 1245: struct linebuffer *lb1, *lb2; 1246: struct linebuffer **thisline, **prevline; 1247: FILE **streams; 1248: int i; 1249: int nleft; 1250: int lossage = 0; 1251: int *file_lossage; 1252: struct linebuffer *prev_out = 0; 1253: FILE *ostream = stdout; 1254: 1255: if (outfile) 1256: ostream = fopen (outfile, "w"); 1257: if (!ostream) pfatal_with_name (outfile); 1258: 1259: init_index (); 1260: 1261: if (nfiles == 0) 1262: { 1263: if (outfile) 1264: fclose (ostream); 1265: return 0; 1266: } 1267: 1268: /* For each file, make two line buffers. 1269: Also, for each file, there is an element of `thisline' 1270: which points at any time to one of the file's two buffers, 1271: and an element of `prevline' which points to the other buffer. 1272: `thisline' is supposed to point to the next available line from the file, 1273: while `prevline' holds the last file line used, 1274: which is remembered so that we can verify that the file is properly sorted. */ 1275: 1276: /* lb1 and lb2 contain one buffer each per file */ 1277: lb1 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer)); 1278: lb2 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer)); 1279: 1280: /* thisline[i] points to the linebuffer holding the next available line in file i, 1281: or is zero if there are no lines left in that file. */ 1282: thisline = (struct linebuffer **) xmalloc (nfiles * sizeof (struct linebuffer *)); 1283: /* prevline[i] points to the linebuffer holding the last used line from file i. 1284: This is just for verifying that file i is properly sorted. */ 1285: prevline = (struct linebuffer **) xmalloc (nfiles * sizeof (struct linebuffer *)); 1286: /* streams[i] holds the input stream for file i. */ 1287: streams = (FILE **) xmalloc (nfiles * sizeof (FILE *)); 1288: /* file_lossage[i] is nonzero if we already know file i is not properly sorted. */ 1289: file_lossage = (int *) xmalloc (nfiles * sizeof (int)); 1290: 1291: /* Allocate and initialize all that storage */ 1292: 1293: for (i = 0; i < nfiles; i++) 1294: { 1295: initbuffer (&lb1[i]); 1296: initbuffer (&lb2[i]); 1297: thisline[i] = &lb1[i]; 1298: prevline[i] = &lb2[i]; 1299: file_lossage[i] = 0; 1300: streams[i] = fopen (infiles[i], "r"); 1301: if (!streams[i]) 1302: pfatal_with_name (infiles[i]); 1303: 1304: readline (thisline[i], streams[i]); 1305: } 1306: 1307: /* Keep count of number of files not at eof */ 1308: nleft = nfiles; 1309: 1310: while (nleft) 1311: { 1312: struct linebuffer *best = 0; 1313: struct linebuffer *exch; 1314: int bestfile = -1; 1315: int i; 1316: 1317: /* Look at the next avail line of each file; choose the least one. */ 1318: 1319: for (i = 0; i < nfiles; i++) 1320: { 1321: if (thisline[i] && 1322: (!best || 1323: 0 < compare_general (best->buffer, thisline[i]->buffer, 1324: (long) bestfile, (long) i, num_keyfields))) 1325: { 1326: best = thisline[i]; 1327: bestfile = i; 1328: } 1329: } 1330: 1331: /* Output that line, unless it matches the previous one and we don't want duplicates */ 1332: 1333: if (!(prev_out && 1334: !compare_general (prev_out->buffer, best->buffer, 0L, 1L, num_keyfields - 1))) 1335: indexify (best->buffer, ostream); 1336: prev_out = best; 1337: 1338: /* Now make the line the previous of its file, and fetch a new line from that file */ 1339: 1340: exch = prevline[bestfile]; 1341: prevline[bestfile] = thisline[bestfile]; 1342: thisline[bestfile] = exch; 1343: 1344: while (1) 1345: { 1346: /* If the file has no more, mark it empty */ 1347: 1348: if (feof (streams[bestfile])) 1349: { 1350: thisline[bestfile] = 0; 1351: nleft--; /* Update the number of files still not empty */ 1352: break; 1353: } 1354: readline (thisline[bestfile], streams[bestfile]); 1355: if (thisline[bestfile]->buffer[0] || !feof (streams[bestfile])) break; 1356: } 1357: } 1358: 1359: finish_index (ostream); 1360: 1361: /* Free all storage and close all input streams */ 1362: 1363: for (i = 0; i < nfiles; i++) 1364: { 1365: fclose (streams[i]); 1366: free (lb1[i].buffer); 1367: free (lb2[i].buffer); 1368: } 1369: free (file_lossage); 1370: free (lb1); 1371: free (lb2); 1372: free (thisline); 1373: free (prevline); 1374: free (streams); 1375: 1376: if (outfile) 1377: fclose (ostream); 1378: 1379: return lossage; 1380: } 1381: 1382: /* Print error message and exit. */ 1383: 1384: fatal (s1, s2) 1385: char *s1, *s2; 1386: { 1387: error (s1, s2); 1388: exit (1); 1389: } 1390: 1391: /* Print error message. `s1' is printf control string, `s2' is arg for it. */ 1392: 1393: error (s1, s2) 1394: char *s1, *s2; 1395: { 1396: printf ("texi: "); 1397: printf (s1, s2); 1398: printf ("\n"); 1399: } 1400: 1401: perror_with_name (name) 1402: char *name; 1403: { 1404: extern int errno, sys_nerr; 1405: extern char *sys_errlist[]; 1406: char *s; 1407: 1408: if (errno < sys_nerr) 1409: s = concat ("", sys_errlist[errno], " for %s"); 1410: else 1411: s = "cannot open %s"; 1412: error (s, name); 1413: } 1414: 1415: pfatal_with_name (name) 1416: char *name; 1417: { 1418: extern int errno, sys_nerr; 1419: extern char *sys_errlist[]; 1420: char *s; 1421: 1422: if (errno < sys_nerr) 1423: s = concat ("", sys_errlist[errno], " for %s"); 1424: else 1425: s = "cannot open %s"; 1426: fatal (s, name); 1427: } 1428: 1429: /* Return a newly-allocated string whose contents concatenate those of s1, s2, s3. */ 1430: 1431: char * 1432: concat (s1, s2, s3) 1433: char *s1, *s2, *s3; 1434: { 1435: int len1 = strlen (s1), len2 = strlen (s2), len3 = strlen (s3); 1436: char *result = (char *) xmalloc (len1 + len2 + len3 + 1); 1437: 1438: strcpy (result, s1); 1439: strcpy (result + len1, s2); 1440: strcpy (result + len1 + len2, s3); 1441: *(result + len1 + len2 + len3) = 0; 1442: 1443: return result; 1444: } 1445: 1446: /* Like malloc but get fatal error if memory is exhausted. */ 1447: 1448: int 1449: xmalloc (size) 1450: int size; 1451: { 1452: int result = malloc (size); 1453: if (!result) 1454: fatal ("virtual memory exhausted", 0); 1455: return result; 1456: } 1457: 1458: 1459: int 1460: xrealloc (ptr, size) 1461: char *ptr; 1462: int size; 1463: { 1464: int result = realloc (ptr, size); 1465: if (!result) 1466: fatal ("virtual memory exhausted"); 1467: return result; 1468: }