1: /* m_getfld.c - read/parse a message */ 2: 3: #include "../h/mh.h" 4: #include <stdio.h> 5: #include "../zotnet/mts.h" 6: #include <ctype.h> 7: 8: 9: /* This module has a long and checkered history. First, it didn't burst 10: maildrops correctly because it considered two CTRL-A:s in a row to be 11: an inter-message delimiter. It really is four CTRL-A:s followed by a 12: newline. Unfortunately, MMDF will convert this delimiter *inside* a 13: message to a CTRL-B followed by three CTRL-A:s and a newline. This 14: caused the old version of m_getfld() to declare eom prematurely. The 15: fix was a lot slower than 16: 17: c == '\001' && peekc (iob) == '\001' 18: 19: but it worked, and to increase generality, UUCP style maildrops could 20: be parsed as well. Unfortunately the speed issue finally caught up with 21: us since this routine is at the very heart of MH. 22: 23: To speed things up considerably, the routine Eom() was made an auxilary 24: function called by the macro eom(). Unless we are bursting a maildrop, 25: the eom() macro returns FALSE saying we aren't at the end of the 26: message. 27: 28: The next thing to do is to read the mtstailor file and initialize 29: delimiter[] and delimlen accordingly... 30: 31: After mhl was made a built-in in msh, m_getfld() worked just fine 32: (using m_unknown() at startup). Until one day: a message which was 33: the result of a bursting was shown. Then, since the burst boundaries 34: aren't CTRL-A:s, m_getfld() would blinding plunge on past the boundary. 35: Very sad. The solution: introduce m_eomsbr(). This hook gets called 36: after the end of each line (since testing for eom involves an fseek()). 37: This worked fine, until one day: a message with no body portion arrived. 38: Then the 39: 40: while (eom (c = Getc (iob), iob)) 41: continue; 42: 43: loop caused m_getfld() to return FMTERR. So, that logic was changed to 44: check for (*eom_action) and act accordingly. 45: 46: This worked fine, until one day: someone didn't use four CTRL:A's as 47: their delimiters. So, the bullet got bit and we read mts.h and 48: continue to struggle on. It's not that bad though, since the only time 49: the code gets executed is when inc (or msh) calls it, and both of these 50: have already called mts_init(). 51: 52: ------------------------ 53: (Written by Van Jacobson for the mh6 m_getfld, January, 1986): 54: 55: This routine was accounting for 60% of the cpu time used by most mh 56: programs. I spent a bit of time tuning and it now accounts for <10% 57: of the time used. Like any heavily tuned routine, it's a bit 58: complex and you want to be sure you understand everything that it's 59: doing before you start hacking on it. Let me try to emphasize 60: that: every line in this atrocity depends on every other line, 61: sometimes in subtle ways. You should understand it all, in detail, 62: before trying to change any part. If you do change it, test the 63: result thoroughly (I use a hand-constructed test file that exercises 64: all the ways a header name, header body, header continuation, 65: header-body separator, body line and body eom can align themselves 66: with respect to a buffer boundary). "Minor" bugs in this routine 67: result in garbaged or lost mail. 68: 69: If you hack on this and slow it down, I, my children and my 70: children's children will curse you. 71: 72: This routine gets used on three different types of files: normal, 73: single msg files, "packed" unix or mmdf mailboxs (when used by inc) 74: and packed, directoried bulletin board files (when used by msh). 75: The biggest impact of different file types is in "eom" testing. The 76: code has been carefully organized to test for eom at appropriate 77: times and at no other times (since the check is quite expensive). 78: I have tried to arrange things so that the eom check need only be 79: done on entry to this routine. Since an eom can only occur after a 80: newline, this is easy to manage for header fields. For the msg 81: body, we try to efficiently search the input buffer to see if 82: contains the eom delimiter. If it does, we take up to the 83: delimiter, otherwise we take everything in the buffer. (The change 84: to the body eom/copy processing produced the most noticeable 85: performance difference, particularly for "inc" and "show".) 86: 87: There are three qualitatively different things this routine busts 88: out of a message: field names, field text and msg bodies. Field 89: names are typically short (~8 char) and the loop that extracts them 90: might terminate on a colon, newline or max width. I considered 91: using a Vax "scanc" to locate the end of the field followed by a 92: "bcopy" but the routine call overhead on a Vax is too large for this 93: to work on short names. If Berkeley ever makes "inline" part of the 94: C optimiser (so things like "scanc" turn into inline instructions) a 95: change here would be worthwhile. 96: 97: Field text is typically 60 - 100 characters so there's (barely) 98: a win in doing a routine call to something that does a "locc" 99: followed by a "bmove". About 30% of the fields have continuations 100: (usually the 822 "received:" lines) and each continuation generates 101: another routine call. "Inline" would be a big win here, as well. 102: 103: Messages, as of this writing, seem to come in two flavors: small 104: (~1K) and long (>2K). Most messages have 400 - 600 bytes of headers 105: so message bodies average at least a few hundred characters. 106: Assuming your system uses reasonably sized stdio buffers (1K or 107: more), this routine should be able to remove the body in large 108: (>500 byte) chunks. The makes the cost of a call to "bcopy" 109: small but there is a premium on checking for the eom in packed 110: maildrops. The eom pattern is always a simple string so we can 111: construct an efficient pattern matcher for it (e.g., a Vax "matchc" 112: instruction). Some thought went into recognizing the start of 113: an eom that has been split across two buffers. 114: 115: This routine wants to deal with large chunks of data so, rather 116: than "getc" into a local buffer, it uses stdio's buffer. If 117: you try to use it on a non-buffered file, you'll get what you 118: deserve. This routine "knows" that struct FILEs have a _ptr 119: and a _cnt to describe the current state of the buffer and 120: it knows that _filbuf ignores the _ptr & _cnt and simply fills 121: the buffer. If stdio on your system doesn't work this way, you 122: may have to make small changes in this routine. 123: 124: This routine also "knows" that an EOF indication on a stream is 125: "sticky" (i.e., you will keep getting EOF until you reposition the 126: stream). If your system doesn't work this way it is broken and you 127: should complain to the vendor. As a consequence of the sticky 128: EOF, this routine will never return any kind of EOF status when 129: there is data in "name" or "buf"). 130: */ 131: 132: 133: #define Getc(iob) getc(iob) 134: #define eom(c,iob) (msg_style != MS_DEFAULT && \ 135: (((c) == *msg_delim && m_Eom(c,iob)) ||\ 136: (eom_action && (*eom_action)(c)))) 137: 138: static char *matchc(); 139: static char *locc(); 140: 141: static char **pat_map; 142: 143: int msg_count = 0; /* disgusting hack for "inc" so it can 144: * know how many characters were stuffed 145: * in the buffer on the last call (see 146: * comments in uip/scansbr.c) */ 147: 148: int msg_style = MS_DEFAULT; 149: /* 150: * The "full" delimiter string for a packed maildrop consists 151: * of a newline followed by the actual delimiter. E.g., the 152: * full string for a Unix maildrop would be: "\n\nFrom ". 153: * "Fdelim" points to the start of the full string and is used 154: * in the BODY case of the main routine to search the buffer for 155: * a possible eom. Msg_delim points to the first character of 156: * the actual delim. string (i.e., fdelim+1). Edelim 157: * points to the 2nd character of actual delimiter string. It 158: * is used in m_Eom because the first character of the string 159: * has been read and matched before m_Eom is called. 160: */ 161: char *msg_delim = ""; 162: static char *fdelim; 163: static char *delimend; 164: static int fdelimlen; 165: static char *edelim; 166: static int edelimlen; 167: 168: static int (*eom_action) () = NULL; 169: 170: /* */ 171: 172: m_getfld (state, name, buf, bufsz, iob) 173: int state; 174: int bufsz; 175: char *name, 176: *buf; 177: register FILE *iob; 178: { 179: register char *cp; 180: register char *bp; 181: register int cnt; 182: register int c; 183: register int i; 184: register int j; 185: register char *ep; 186: register char *sp; 187: 188: if ((c = Getc(iob)) < 0) { 189: msg_count = 0; 190: *buf = 0; 191: return FILEEOF; 192: } 193: if (eom (c, iob)) { 194: if (! eom_action) { 195: /* flush null messages */ 196: while ((c = Getc(iob)) >= 0 && eom (c, iob)) 197: ; 198: if (c >= 0) 199: (void) ungetc(c, iob); 200: } 201: msg_count = 0; 202: *buf = 0; 203: return FILEEOF; 204: } 205: 206: switch (state) { 207: case FLDEOF: 208: case BODYEOF: 209: case FLD: 210: if (c == '\n' || c == '-') { 211: /* we hit the header/body separator */ 212: while (c != '\n' && (c = Getc(iob)) >= 0) 213: ; 214: 215: if (c < 0 || (c = Getc(iob)) < 0 || eom (c, iob)) { 216: if (! eom_action) { 217: /* flush null messages */ 218: while ((c = Getc(iob)) >= 0 && eom (c, iob)) 219: ; 220: if (c >= 0) 221: (void) ungetc(c, iob); 222: } 223: msg_count = 0; 224: *buf = 0; 225: return FILEEOF; 226: } 227: state = BODY; 228: goto body; 229: } 230: /* 231: * get the name of this component. take characters up 232: * to a ':', a newline or NAMESZ-1 characters, whichever 233: * comes first. 234: */ 235: cp = name; i = NAMESZ - 1; 236: for (;;) { 237: bp = sp = iob->_ptr - 1; 238: j = (cnt = iob->_cnt+1) < i ? cnt : i; 239: while ((c = *bp++) != ':' && c != '\n' && --j >= 0) 240: *cp++ = c; 241: 242: j = bp - sp; 243: if ((cnt -= j) <= 0) { 244: if (_filbuf(iob) == EOF) { 245: *cp = *buf = NULL; 246: advise (NULLCP, "eof encountered in field \"%s\"", 247: name); 248: return FMTERR; 249: } 250: } else { 251: iob->_ptr = bp + 1; 252: iob->_cnt = cnt - 1; 253: } 254: if (c == ':') 255: break; 256: 257: /* 258: * something went wrong. possibilities are: 259: * . hit a newline (error) 260: * . got more than namesz chars. (error) 261: * . hit the end of the buffer. (loop) 262: */ 263: if (c == '\n') { 264: *cp = *buf = NULL; 265: advise (NULLCP, "eol encountered in field \"%s\"", name); 266: state = FMTERR; 267: goto finish; 268: } 269: if ((i -= j) <= 0) { 270: *cp = *buf = NULL; 271: advise (NULLCP, "field name \"%s\" exceeds %d bytes", 272: name, NAMESZ - 1); 273: state = LENERR; 274: goto finish; 275: } 276: } 277: 278: while (isspace (*--cp) && cp >= name) 279: ; 280: *++cp = NULL; 281: /* fall through */ 282: 283: case FLDPLUS: 284: /* 285: * get (more of) the text of a field. take 286: * characters up to the end of this field (newline 287: * followed by non-blank) or bufsz-1 characters. 288: */ 289: cp = buf; i = bufsz-1; 290: for (;;) { 291: cnt = iob->_cnt++; bp = --iob->_ptr; 292: c = cnt < i ? cnt : i; 293: while (ep = locc( c, bp, '\n' )) { 294: /* 295: * if we hit the end of this field, return. 296: */ 297: if ((j = *++ep) != ' ' && j != '\t') { 298: j = ep - iob->_ptr; 299: (void) bcopy( iob->_ptr, cp, j); 300: iob->_ptr = ep; iob->_cnt -= j; 301: cp += j; 302: state = FLD; 303: goto finish; 304: } 305: c -= ep - bp; bp = ep; 306: } 307: /* 308: * end of input or dest buffer - copy what we've found. 309: */ 310: c += bp - iob->_ptr; 311: (void) bcopy( iob->_ptr, cp, c); 312: i -= c; cp += c; 313: if (i <= 0) { 314: /* the dest buffer is full */ 315: iob->_cnt -= c; iob->_ptr += c; 316: state = FLDPLUS; 317: break; 318: } 319: /* 320: * There's one character left in the input buffer. 321: * Copy it & fill the buffer. If the last char 322: * was a newline and the next char is not whitespace, 323: * this is the end of the field. Otherwise loop. 324: */ 325: --i; 326: *cp++ = j = *(iob->_ptr + c); 327: c = _filbuf(iob); 328: if (j == '\n' && c != ' ' && c != '\t') { 329: if (c != EOF) 330: --iob->_ptr, ++iob->_cnt; 331: state = FLD; 332: break; 333: } 334: } 335: break; 336: 337: case BODY: 338: body: 339: /* 340: * get the message body up to bufsz characters or the 341: * end of the message. Sleazy hack: if bufsz is negative 342: * we assume that we were called to copy directly into 343: * the output buffer and we don't add an eos. 344: */ 345: i = (bufsz < 0) ? -bufsz : bufsz-1; 346: bp = --iob->_ptr; cnt = ++iob->_cnt; 347: c = (cnt < i ? cnt : i); 348: if (msg_style != MS_DEFAULT && c > 1) { 349: /* 350: * packed maildrop - only take up to the (possible) 351: * start of the next message. This "matchc" should 352: * probably be a Boyer-Moore matcher for non-vaxen, 353: * particularly since we have the alignment table 354: * all built for the end-of-buffer test (next). 355: * But our vax timings indicate that the "matchc" 356: * instruction is 50% faster than a carefully coded 357: * B.M. matcher for most strings. (So much for elegant 358: * algorithms vs. brute force.) Since I (currently) 359: * run MH on a vax, we use the matchc instruction. --vj 360: */ 361: if (ep = matchc( fdelimlen, fdelim, c, bp ) ) 362: c = ep - bp + 1; 363: else { 364: /* 365: * There's no delim in the buffer but there may be 366: * a partial one at the end. If so, we want to leave 367: * it so the "eom" check on the next call picks it up. 368: * Use a modified Boyer-Moore matcher to make this 369: * check relatively cheap. The first "while" figures 370: * out what position in the pattern matches the last 371: * character in the buffer. The inner "while" matches 372: * the pattern against the buffer, backwards starting 373: * at that position. Note that unless the buffer 374: * ends with one of the characters in the pattern 375: * (excluding the first and last), we do only one test. 376: */ 377: sp = delimend; 378: ep = bp + c - 1; 379: while ((cp = pat_map[*ep]) < sp) { 380: ep = bp + c - 1; sp = cp; 381: while (*--ep == *--cp && cp > fdelim) 382: ; 383: if (cp == fdelim) { 384: if (*ep == *cp && ep > bp) 385: c = (ep - bp) + 1; 386: break; 387: } 388: } 389: } 390: } 391: (void) bcopy( bp, buf, c ); 392: iob->_cnt -= c; 393: iob->_ptr += c; 394: if (bufsz < 0) { 395: msg_count = c; 396: return (state); 397: } 398: cp = buf + c; 399: break; 400: 401: default: 402: adios (NULLCP, "m_getfld() called with bogus state of %d", state); 403: } 404: finish:; 405: *cp = NULL; 406: msg_count = cp - buf; 407: return (state); 408: } 409: 410: /* */ 411: 412: #ifdef RPATHS 413: static char unixbuf[BUFSIZ] = ""; 414: #endif RPATHS 415: 416: void m_unknown (iob) 417: register FILE *iob; 418: { 419: register int c; 420: register long pos; 421: char text[10]; 422: register char *cp; 423: 424: msg_style = MS_UNKNOWN; 425: 426: /* Figure out what the message delimitter string is for this 427: * maildrop. (This used to be part of m_Eom but I didn't like 428: * the idea of an "if" statement that could only succeed on the 429: * first call to m_Eom getting executed on each call, i.e., at 430: * every newline in the message). 431: * 432: * If the first line of the maildrop is a Unix "from" line, we say the 433: * style is UUCP and eat the rest of the line. Otherwise we say the style 434: * is MMDF & look for the delimiter string specified when MH was built 435: * (or from the mtstailor file). 436: */ 437: pos = ftell (iob); 438: if (fread (text, sizeof *text, 5, iob) == 5 439: && strncmp (text, "From ", 5) == 0) { 440: msg_style = MS_UUCP; 441: fdelim = "\n\nFrom "; 442: #ifndef RPATHS 443: while ((c = getc (iob)) != '\n' && c >= 0) 444: ; 445: #else RPATHS 446: cp = unixbuf; 447: while ((c = getc (iob)) != '\n') 448: *cp++ = c; 449: *cp = NULL; 450: #endif RPATHS 451: } else { 452: /* not a Unix style maildrop */ 453: (void) fseek (iob, pos, 0); 454: if (mmdlm2 == NULLCP || *mmdlm2 == NULL) 455: mmdlm2 = "\001\001\001\001\n"; 456: fdelim = (char *)malloc((unsigned)strlen(mmdlm2)+2); 457: *fdelim = '\n'; 458: (void)strcpy(fdelim+1, mmdlm2); 459: msg_style = MS_MMDF; 460: } 461: fdelimlen = strlen(fdelim); 462: msg_delim = fdelim+1; 463: edelim = msg_delim+1; 464: edelimlen = fdelimlen-2; 465: delimend = msg_delim + edelimlen; 466: if (edelimlen <= 1) 467: adios (NULLCP, "maildrop delimiter must be at least 2 bytes"); 468: /* 469: * build a Boyer-Moore end-position map for the matcher in m_getfld. 470: * N.B. - we don't match just the first char (since it's the newline 471: * separator) or the last char (since the matchc would have found it 472: * if it was a real delim). 473: */ 474: pat_map = (char **) malloc( 256 * sizeof (char *)); 475: for (c = 256; c--; ) 476: pat_map[c] = delimend + 1; 477: 478: for (cp = fdelim + 1; cp < delimend; cp++ ) 479: pat_map[*cp] = cp; 480: 481: if (msg_style == MS_MMDF) { 482: /* flush extra msg hdrs */ 483: while ((c = Getc(iob)) >= 0 && eom (c, iob)) 484: ; 485: if (c >= 0) 486: (void) ungetc(c, iob); 487: } 488: } 489: 490: 491: void m_eomsbr (action) 492: int (*action) (); 493: { 494: if (eom_action = action) { 495: msg_style = MS_MSH; 496: *msg_delim = 0; 497: fdelimlen = 1; 498: delimend = fdelim; 499: } else { 500: msg_style = MS_MMDF; 501: msg_delim = fdelim + 1; 502: fdelimlen = strlen (fdelim); 503: delimend = msg_delim + edelimlen; 504: } 505: } 506: 507: /* */ 508: 509: /* test for msg delimiter string */ 510: 511: int m_Eom (c, iob) 512: register int c; 513: register FILE *iob; 514: { 515: register long pos = 0L; 516: register int i; 517: char text[10]; 518: #ifdef RPATHS 519: register char *cp; 520: #endif RPATHS 521: 522: pos = ftell (iob); 523: if ((i = fread (text, sizeof *text, edelimlen, iob)) != edelimlen 524: || strncmp (text, edelim, edelimlen)) { 525: if (i == 0 && msg_style == MS_UUCP) 526: /* the final newline in the (brain damaged) unix-format 527: * maildrop is part of the delimitter - delete it. 528: */ 529: return 1; 530: 531: (void) fseek (iob, pos, 0); 532: return 0; 533: } 534: 535: if (msg_style == MS_UUCP) { 536: #ifndef RPATHS 537: while ((c = getc (iob)) != '\n') 538: if (c < 0) 539: break; 540: #else RPATHS 541: cp = unixbuf; 542: while ((c = getc (iob)) != '\n' && c >= 0) 543: *cp++ = c; 544: *cp = NULL; 545: #endif RPATHS 546: } 547: 548: return 1; 549: } 550: 551: /* */ 552: 553: #ifdef RPATHS 554: char *unixline () { 555: register char *cp, 556: *dp, 557: *pp; 558: static char unixfrom[BUFSIZ]; 559: 560: pp = unixfrom; 561: if (cp = dp = index (unixbuf, ' ')) { 562: while (cp = index (cp + 1, 'r')) 563: if (strncmp (cp, "remote from ", 12) == 0) { 564: *cp = NULL; 565: (void) sprintf (pp, "%s!", cp + 12); 566: pp += strlen (pp); 567: break; 568: } 569: if (cp == NULL) 570: cp = unixbuf + strlen (unixbuf); 571: if ((cp -= 25) >= dp) 572: *cp = NULL; 573: } 574: 575: (void) sprintf (pp, "%s\n", unixbuf); 576: unixbuf[0] = NULL; 577: return unixfrom; 578: } 579: #endif RPATHS 580: 581: /* */ 582: 583: #if (vax && !lint) 584: asm(".align 1"); 585: asm("_matchc: .word 0"); 586: asm(" movq 4(ap),r0"); 587: asm(" movq 12(ap),r2"); 588: asm(" matchc r0,(r1),r2,(r3)"); 589: asm(" beql 1f"); 590: asm(" movl 4(ap),r3"); 591: asm("1: subl3 4(ap),r3,r0"); 592: asm(" ret"); 593: #else 594: static char * 595: matchc( patln, pat, strln, str ) 596: int patln; 597: char *pat; 598: int strln; 599: register char *str; 600: { 601: register char *es = str + strln - patln; 602: register char *sp; 603: register char *pp; 604: register char *ep = pat + patln; 605: register char pc = *pat++; 606: 607: for(;;) { 608: while (pc != *str++) 609: if (str > es) 610: return 0; 611: 612: sp = str; pp = pat; 613: while (pp < ep && *sp++ == *pp++) 614: ; 615: if (pp >= ep) 616: return (--str); 617: } 618: } 619: #endif 620: 621: /* */ 622: 623: /* 624: * Locate character "term" in the next "cnt" characters of "src". 625: * If found, return its address, otherwise return 0. 626: */ 627: #if (vax && !lint) 628: asm(".align 1"); 629: asm("_locc: .word 0"); 630: asm(" movq 4(ap),r0"); 631: asm(" locc 12(ap),r0,(r1)"); 632: asm(" beql 1f"); 633: asm(" movl r1,r0"); 634: asm("1: ret"); 635: #else 636: static char * 637: locc( cnt, src, term ) 638: register int cnt; 639: register char *src; 640: register char term; 641: { 642: while (*src++ != term && --cnt > 0); 643: 644: return (cnt > 0 ? --src : NULLCP); 645: } 646: #endif 647: 648: /* */ 649: 650: #if !defined (BSD42) && !defined (bcopy) 651: int bcmp (b1, b2, length) 652: register char *b1, 653: *b2; 654: register int length; 655: { 656: while (length-- > 0) 657: if (*b1++ != *b2++) 658: return 1; 659: 660: return 0; 661: } 662: 663: 664: bcopy (b1, b2, length) 665: register char *b1, 666: *b2; 667: register int length; 668: { 669: while (length-- > 0) 670: *b2++ = *b1++; 671: } 672: 673: 674: bzero (b, length) 675: register char *b; 676: register int length; 677: { 678: while (length-- > 0) 679: *b++ = NULL; 680: } 681: #endif not BSD42 or SYS5