1: # include <stdio.h> 2: # include <ctype.h> 3: # include "strfile.h" 4: 5: /* 6: * This program takes a file composed of strings seperated by 7: * lines starting with two consecutive delimiting character (default 8: * character is '%') and creates another file which consists of a table 9: * describing the file (structure from "strfile.h"), a table of seek 10: * pointers to the start of the strings, and the strings, each terinated 11: * by a null byte. Usage: 12: * 13: * % strfile [ - ] [ -cC ] [ -sv ] [ -oir ] sourcefile [ datafile ] 14: * 15: * - - Give a usage summary useful for jogging the memory 16: * c - Change delimiting character from '%' to 'C' 17: * s - Silent. Give no summary of data processed at the end of 18: * the run. 19: * v - Verbose. Give summary of data processed. (Default) 20: * o - order the strings in alphabetic order 21: * i - if ordering, ignore case 22: * r - randomize the order of the strings 23: * 24: * Ken Arnold Sept. 7, 1978 -- 25: * 26: * Added method to indicate dividers. A "%-" will cause the address 27: * to be added to the structure in one of the pointer elements. 28: * 29: * Ken Arnold Nov., 1984 -- 30: * 31: * Added ordering options. 32: */ 33: 34: # define TRUE 1 35: # define FALSE 0 36: 37: # define DELIM_CH '-' 38: 39: typedef struct { 40: char first; 41: long pos; 42: } STR; 43: 44: char *Infile = NULL, /* input file name */ 45: Outfile[100] = "", /* output file name */ 46: Delimch = '%', /* delimiting character */ 47: *Usage[] = { /* usage summary */ 48: "usage: strfile [ - ] [ -cC ] [ -sv ] [ -oir ] inputfile [ datafile ]", 49: " - - Give this usage summary", 50: " c - Replace delimiting character with 'C'", 51: " s - Silent. Give no summary", 52: " v - Verbose. Give summary. (default)", 53: " o - order strings alphabetically", 54: " i - ignore case in ordering", 55: " r - randomize the order of the strings", 56: " Default \"datafile\" is inputfile.dat", 57: NULL 58: }; 59: 60: int Sflag = FALSE; /* silent run flag */ 61: int Oflag = FALSE; /* ordering flag */ 62: int Iflag = FALSE; /* ignore case flag */ 63: int Rflag = FALSE; /* randomize order flag */ 64: int Delim = 0; /* current delimiter number */ 65: 66: long *Seekpts; 67: 68: FILE *Sort_1, *Sort_2; /* pointers for sorting */ 69: 70: STRFILE Tbl; /* statistics table */ 71: 72: STR *Firstch; /* first chars of each string */ 73: 74: char *fgets(), *malloc(), *strcpy(), *strcat(); 75: 76: long ftell(); 77: 78: main(ac, av) 79: int ac; 80: char **av; 81: { 82: register char *sp, dc; 83: register long *lp; 84: register unsigned int curseek; /* number of strings */ 85: register long *seekpts, li; /* table of seek pointers */ 86: register FILE *inf, *outf; 87: register int first; 88: register char *nsp; 89: register STR *fp; 90: static char string[257]; 91: 92: getargs(ac, av); /* evalute arguments */ 93: 94: /* 95: * initial counting of input file 96: */ 97: 98: dc = Delimch; 99: if ((inf = fopen(Infile, "r")) == NULL) { 100: perror(Infile); 101: exit(-1); 102: } 103: for (curseek = 0; (sp = fgets(string, 256, inf)) != NULL; ) 104: if (*sp++ == dc && (*sp == dc || *sp == DELIM_CH)) 105: curseek++; 106: curseek++; 107: 108: /* 109: * save space at begginning of file for tables 110: */ 111: 112: if ((outf = fopen(Outfile, "w")) == NULL) { 113: perror(Outfile); 114: exit(-1); 115: } 116: 117: /* 118: * Allocate space for the pointers, adding one to the end so the 119: * length of the final string can be calculated. 120: */ 121: ++curseek; 122: seekpts = (long *) malloc(sizeof *seekpts * curseek); /* NOSTRICT */ 123: if (seekpts == NULL) { 124: perror("calloc"); 125: exit(-1); 126: } 127: if (Oflag) { 128: Firstch = (STR *) malloc(sizeof *Firstch * curseek); 129: if (Firstch == NULL) { 130: perror("calloc"); 131: exit(-1); 132: } 133: } 134: 135: (void) fseek(outf, (long) (sizeof Tbl + sizeof *seekpts * curseek), 0); 136: (void) fseek(inf, (long) 0, 0); /* goto start of input */ 137: 138: /* 139: * write the strings onto the file 140: */ 141: 142: Tbl.str_longlen = 0; 143: Tbl.str_shortlen = (unsigned int) 0xffffffff; 144: lp = seekpts; 145: first = Oflag; 146: *seekpts = ftell(outf); 147: fp = Firstch; 148: do { 149: sp = fgets(string, 256, inf); 150: if (sp == NULL || 151: (*sp == dc && (sp[1] == dc || sp[1] == DELIM_CH))) { 152: putc('\0', outf); 153: *++lp = ftell(outf); 154: li = ftell(outf) - lp[-1] - 1; 155: if (Tbl.str_longlen < li) 156: Tbl.str_longlen = li; 157: if (Tbl.str_shortlen > li) 158: Tbl.str_shortlen = li; 159: if (sp && sp[1] == DELIM_CH && Delim < MAXDELIMS) 160: Tbl.str_delims[Delim++] = lp - seekpts; 161: first = Oflag; 162: } 163: else { 164: if (first) { 165: for (nsp = sp; !isalnum(*nsp); nsp++) 166: continue; 167: if (Iflag && isupper(*nsp)) 168: fp->first = tolower(*nsp); 169: else 170: fp->first = *nsp; 171: fp->pos = *lp; 172: fp++; 173: first = FALSE; 174: } 175: fputs(sp, outf); 176: } 177: } while (sp != NULL); 178: 179: /* 180: * write the tables in 181: */ 182: 183: (void) fclose(inf); 184: Tbl.str_numstr = curseek - 1; 185: 186: if (Oflag) 187: do_order(seekpts, outf); 188: else if (Rflag) 189: randomize(seekpts); 190: 191: (void) fseek(outf, (long) 0, 0); 192: (void) fwrite((char *) &Tbl, sizeof Tbl, 1, outf); 193: (void) fwrite((char *) seekpts, sizeof *seekpts, curseek, outf); 194: (void) fclose(outf); 195: 196: if (!Sflag) { 197: printf("\"%s\" converted to \"%s\"\n", Infile, Outfile); 198: if (curseek == 0) 199: puts("There was 1 string"); 200: else 201: printf("There were %u strings\n", curseek - 1); 202: printf("Longest string: %u byte%s\n", Tbl.str_longlen, 203: Tbl.str_longlen == 1 ? "" : "s"); 204: printf("Shortest string: %u byte%s\n", Tbl.str_shortlen, 205: Tbl.str_shortlen == 1 ? "" : "s"); 206: } 207: exit(0); 208: } 209: 210: /* 211: * This routine evaluates arguments from the command line 212: */ 213: getargs(ac, av) 214: register int ac; 215: register char **av; 216: { 217: register char *sp; 218: register int i; 219: register int bad, j; 220: 221: bad = 0; 222: for (i = 1; i < ac; i++) 223: if (*av[i] == '-' && av[i][1]) { 224: for (sp = &av[i][1]; *sp; sp++) 225: switch (*sp) { 226: case 'c': /* new delimiting char */ 227: if ((Delimch = *++sp) == '\0') { 228: --sp; 229: Delimch = *av[++i]; 230: } 231: if (Delimch <= 0 || Delimch > '~' || 232: Delimch == DELIM_CH) { 233: printf("bad delimiting character: '\\%o\n'", 234: Delimch); 235: bad++; 236: } 237: break; 238: case 's': /* silent */ 239: Sflag++; 240: break; 241: case 'v': /* verbose */ 242: Sflag = 0; 243: break; 244: case 'o': /* order strings */ 245: Oflag++; 246: break; 247: case 'i': /* ignore case in ordering */ 248: Iflag++; 249: break; 250: case 'r': /* ignore case in ordering */ 251: Rflag++; 252: break; 253: default: /* unknown flag */ 254: bad++; 255: printf("bad flag: '%c'\n", *sp); 256: break; 257: } 258: } 259: else if (*av[i] == '-') { 260: for (j = 0; Usage[j]; j++) 261: puts(Usage[j]); 262: exit(0); 263: } 264: else if (Infile) 265: (void) strcpy(Outfile, av[i]); 266: else 267: Infile = av[i]; 268: if (!Infile) { 269: bad++; 270: puts("No input file name"); 271: } 272: if (*Outfile == '\0' && !bad) { 273: (void) strcpy(Outfile, Infile); 274: (void) strcat(Outfile, ".dat"); 275: } 276: if (bad) { 277: puts("use \"strfile -\" to get usage"); 278: exit(-1); 279: } 280: } 281: 282: /* 283: * do_order: 284: * Order the strings alphabetically (possibly ignoring case). 285: */ 286: do_order(seekpts, outf) 287: long *seekpts; 288: FILE *outf; 289: { 290: register int i; 291: register long *lp; 292: register STR *fp; 293: extern int cmp_str(); 294: 295: (void) fflush(outf); 296: Sort_1 = fopen(Outfile, "r"); 297: Sort_2 = fopen(Outfile, "r"); 298: Seekpts = seekpts; 299: qsort((char *) Firstch, Tbl.str_numstr, sizeof *Firstch, cmp_str); 300: i = Tbl.str_numstr; 301: lp = seekpts; 302: fp = Firstch; 303: while (i--) 304: *lp++ = fp++->pos; 305: (void) fclose(Sort_1); 306: (void) fclose(Sort_2); 307: Tbl.str_flags |= STR_ORDERED; 308: } 309: 310: /* 311: * cmp_str: 312: * Compare two strings in the file 313: */ 314: cmp_str(p1, p2) 315: STR *p1, *p2; 316: { 317: register int c1, c2; 318: 319: c1 = p1->first; 320: c2 = p2->first; 321: if (c1 != c2) 322: return c1 - c2; 323: 324: (void) fseek(Sort_1, p1->pos, 0); 325: (void) fseek(Sort_2, p2->pos, 0); 326: 327: while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0') 328: continue; 329: while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0') 330: continue; 331: 332: while (c1 != '\0' && c2 != '\0') { 333: if (Iflag) { 334: if (isupper(c1)) 335: c1 = tolower(c1); 336: if (isupper(c2)) 337: c2 = tolower(c2); 338: } 339: if (c1 != c2) 340: return c1 - c2; 341: c1 = getc(Sort_1); 342: c2 = getc(Sort_2); 343: } 344: return c1 - c2; 345: } 346: 347: /* 348: * randomize: 349: * Randomize the order of the string table. We must be careful 350: * not to randomize across delimiter boundaries. All 351: * randomization is done within each block. 352: */ 353: randomize(seekpts) 354: register long *seekpts; 355: { 356: register int cnt, i, j, start; 357: register long tmp; 358: register long *origsp; 359: 360: Tbl.str_flags |= STR_RANDOM; 361: srnd(time((long *) NULL) + getpid()); 362: origsp = seekpts; 363: for (j = 0; j <= Delim; j++) { 364: 365: /* 366: * get the starting place for the block 367: */ 368: 369: if (j == 0) 370: start = 0; 371: else 372: start = Tbl.str_delims[j - 1]; 373: 374: /* 375: * get the ending point 376: */ 377: 378: if (j == Delim) 379: cnt = Tbl.str_numstr; 380: else 381: cnt = Tbl.str_delims[j]; 382: 383: /* 384: * move things around randomly 385: */ 386: 387: for (seekpts = &origsp[start]; cnt > start; cnt--, seekpts++) { 388: i = rnd(cnt - start); 389: tmp = seekpts[0]; 390: seekpts[0] = seekpts[i]; 391: seekpts[i] = tmp; 392: } 393: } 394: }