/* * aegis - project change supervisor * Copyright (C) 2001 Peter Miller; * All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA. * * MANIFEST: functions to manipulate quoted_prints * * From RFC 1521... * * The Quoted-Printable encoding is intended to represent data * that largely consists of octets that correspond to printable * characters in the ASCII character set. It encodes the data in * such a way that the resulting octets are unlikely to be modified * by mail transport. If the data being encoded are mostly ASCII * text, the encoded form of the data remains largely recognizable * by humans. A body which is entirely ASCII may also be encoded * in Quoted-Printable to ensure the integrity of the data should * the message pass through a character- translating, and/or * line-wrapping gateway. * * In this encoding, octets are to be represented as determined by * the following rules: * * Rule #1: (General 8-bit representation) Any octet, except those * indicating a line break according to the newline convention * of the canonical (standard) form of the data being encoded, * may be represented by an "=" followed by a two digit hexadecimal * representation of the octet's value. The digits of the hexadecimal * alphabet, for this purpose, are "0123456789ABCDEF". Uppercase * letters must be used when sending hexadecimal data, though a * robust implementation may choose to recognize lowercase letters * on receipt. Thus, for example, the value 12 (ASCII form feed) * can be represented by "=0C", and the value 61 (ASCII EQUAL SIGN) * can be represented by "=3D". Except when the following rules * allow an alternative encoding, this rule is mandatory. * * Rule #2: (Literal representation) Octets with decimal values of * 33 through 60 inclusive, and 62 through 126, inclusive, MAY be * represented as the ASCII characters which correspond to those * octets (EXCLAMATION POINT through LESS THAN, and GREATER THAN * through TILDE, respectively). * * Rule #3: (White Space): Octets with values of 9 and 32 MAY be * represented as ASCII TAB (HT) and SPACE characters, respectively, * but MUST NOT be so represented at the end of an encoded line. Any * TAB (HT) or SPACE characters on an encoded line MUST thus be * followed on that line by a printable character. In particular, * an "=" at the end of an encoded line, indicating a soft line * break (see rule #5) may follow one or more TAB (HT) or SPACE * characters. It follows that an octet with value 9 or 32 appearing * at the end of an encoded line must be represented according * to Rule #1. This rule is necessary because some MTAs (Message * Transport Agents, programs which transport messages from one * user to another, or perform a part of such transfers) are known * to pad lines of text with SPACEs, and others are known to remove * "white space" characters from the end of a line. Therefore, when * decoding a Quoted-Printable body, any trailing white space on * a line must be deleted, as it will necessarily have been added * by intermediate transport agents. * * Rule #4 (Line Breaks): A line break in a text body, independent of * what its representation is following the canonical representation * of the data being encoded, must be represented by a (RFC 822) * line break, which is a CRLF sequence, in the Quoted-Printable * encoding. Since the canonical representation of types other than * text do not generally include the representation of line breaks, * no hard line breaks (i.e. line breaks that are intended to * be meaningful and to be displayed to the user) should occur * in the quoted-printable encoding of such types. Of course, * occurrences of "=0D", "=0A", "0A=0D" and "=0D=0A" will eventually * be encountered. In general, however, base64 is preferred over * quoted-printable for binary data. * * Note that many implementations may elect to encode the local * representation of various content types directly, as described * in Appendix G. In particular, this may apply to plain text * material on systems that use newline conventions other than * CRLF delimiters. Such an implementation is permissible, but the * generation of line breaks must be generalized to account for * the case where alternate representations of newline sequences * are used. * * Rule #5 (Soft Line Breaks): The Quoted-Printable encoding REQUIRES * that encoded lines be no more than 76 characters long. If longer * lines are to be encoded with the Quoted-Printable encoding, * 'soft' line breaks must be used. An equal sign as the last * character on a encoded line indicates such a non-significant * ('soft') line break in the encoded text. Thus if the "raw" * form of the line is a single unencoded line that says: * * Now's the time for all folk to come to the aid of their country. * * This can be represented, in the Quoted-Printable encoding, as * * Now's the time = * for all folk to come = * to the aid of their country. * * This provides a mechanism with which long lines are encoded in * such a way as to be restored by the user agent. The 76 character * limit does not count the trailing CRLF, but counts all other * characters, including any equal signs. * * Since the hyphen character ("-") is represented as itself * in the Quoted-Printable encoding, care must be taken, when * encapsulating a quoted-printable encoded body in a multipart * entity, to ensure that the encapsulation boundary does not * appear anywhere in the encoded body. (A good strategy is to * choose a boundary that includes a character sequence such as * "=_" which can never appear in a quoted- printable body. See * the definition of multipart messages later in this document.) * * NOTE: The quoted-printable encoding represents something * of a compromise between readability and reliability in * transport. Bodies encoded with the quoted-printable encoding * will work reliably over most mail gateways, but may not * work perfectly over a few gateways, notably those involving * translation into EBCDIC. (In theory, an EBCDIC gateway could * decode a quoted-printable body and re-encode it using base64, * but such gateways do not yet exist.) A higher level of confidence * is offered by the base64 Content-Transfer-Encoding. A way to get * reasonably reliable transport through EBCDIC gateways is to also * quote the ASCII characters * * !"#$@[\]^`{|}~ * * according to rule #1. See Appendix B for more information. * * Because quoted-printable data is generally assumed to be line- * oriented, it is to be expected that the representation of * the breaks between the lines of quoted printable data may * be altered in transport, in the same manner that plain text * mail has always been altered in Internet mail when passing * between systems with differing newline conventions. If such * alterations are likely to constitute a corruption of the data, * it is probably more sensible to use the base64 encoding rather * than the quoted-printable encoding. * * WARNING TO IMPLEMENTORS: If binary data are encoded in quoted- * printable, care must be taken to encode CR and LF characters as * "=0D" and "=0A", respectively. In particular, a CRLF sequence in * binary data should be encoded as "=0D=0A". Otherwise, if CRLF * were represented as a hard line break, it might be incorrectly * decoded on platforms with different line break conventions. * * For formalists, the syntax of quoted-printable data is described * by the following grammar: * * quoted-printable := ([*(ptext / SPACE / TAB) ptext] ["="] CRLF) * ; Maximum line length of 76 characters excluding CRLF * * ptext := octet / 127, =, SPACE, or TAB, * ; and is recommended for any characters not listed in * ; Appendix B as "mail-safe". */ #include #include #include #include typedef struct input_base64_ty input_base64_ty; struct input_base64_ty { input_ty inherited; input_ty *deeper; int close_on_close; int eof; long pos; }; static void input_quoted_printable_destructor _((input_ty *)); static void input_quoted_printable_destructor(fp) input_ty *fp; { input_base64_ty *this; this = (input_base64_ty *)fp; if (this->close_on_close) input_delete(this->deeper); this->deeper = 0; /* paranoia */ } static int hex _((int)); static int hex(c) int c; { switch (c) { default: return -1; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return (c - '0'); case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': return (c - 'A' + 10); case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': return (c - 'a' + 10); } } static long input_quoted_printable_read _((input_ty *, void *, size_t)); static long input_quoted_printable_read(fp, data, len) input_ty *fp; void *data; size_t len; { input_base64_ty *this; unsigned char *cp; unsigned char *end; long nbytes; this = (input_base64_ty *)fp; if (this->eof) return 0; cp = data; end = cp + len; while (cp < end) { int c; int n1, n2; c = input_getc(this->deeper); if (c < 0) { this->eof = 1; break; } if (c == ' ' || c == '\t') { static char *buffer; static size_t bufmax; size_t bufpos; size_t nchars; /* * We are supposed to suppress white space on * the ends of lines. This is because some * (non-unix, non-windows) mail transfer agents * add extra white space on the ends of lines. * (Our corresponding encoding escapes trailing * spaces and tabs.) */ bufpos = 0; for (;;) { /* * Stash this character (we may need it later) */ if (bufpos >= bufmax) { bufmax = bufmax * 2 + 8; buffer = mem_change_size(buffer, bufmax); } buffer[bufpos++] = c; /* * See what comes next. */ c = input_getc(this->deeper); if (c < 0) break; if (c == '\n') { *cp++ = '\n'; goto next_char; } if (c != ' ' && c != '\t') { input_ungetc(this->deeper, c); break; } } /* * Put as many of the buffered characters into * the output as possible. This means we won't * double handle them (actually, we would O(n**2) * handle them). */ nchars = end - cp; if (nchars > bufpos) nchars = bufpos; memcpy(cp, buffer, nchars); cp += nchars; /* * If there wasn't room, there is no help for it. * We will have to give the rest of the buffered * characters back. Hopefully next time will * be big enough for all of them. The * pathological case required >16KB of spaces and * tabs: unlikely. */ while (bufpos > nchars) { --bufpos; input_ungetc(this->deeper, buffer[bufpos]); } /* * Don't fall into the next statement, but start * this loop from the top. (We could have run * out of output buffer). */ continue; } /* * If this isn't an escape sequence, return the literal * character. */ if (c != '=') { *cp++ = c; continue; } /* * Grab two hex digits. If they aren't hex digits, * it is a format error. * * Except for trailing white space; that we ignore. */ c = input_getc(this->deeper); if (c < 0) break; if (c == ' ' || c == '\t') { for (;;) { c = input_getc(this->deeper); if (c == '\n') break; if (c != ' ' && c != '\t') { input_fatal_error(this->deeper, "quoted printable: invalid character"); /* NOTREACHED */ } } } if (c == '\n') continue; n1 = hex(c); if (n1 < 0) { input_fatal_error(fp, "quoted printable: invalid hex character"); /* NOTREACHED */ } c = input_getc(this->deeper); n2 = hex(c); if (n2 < 0) { input_fatal_error(fp, "quoted printable: invalid hex character"); /* NOTREACHED */ } *cp++ = ((n1 << 4) | n2); next_char: ; } nbytes = (cp - (unsigned char *)data); this->pos += nbytes; return nbytes; } static long input_quoted_printable_ftell _((input_ty *)); static long input_quoted_printable_ftell(deeper) input_ty *deeper; { input_base64_ty *this; this = (input_base64_ty *)deeper; return this->pos; } static struct string_ty *input_quoted_printable_name _((input_ty *)); static struct string_ty * input_quoted_printable_name(fp) input_ty *fp; { input_base64_ty *this; this = (input_base64_ty *)fp; return input_name(this->deeper); } static long input_quoted_printable_length _((input_ty *)); static long input_quoted_printable_length(fp) input_ty *fp; { return -1; } static input_vtbl_ty vtbl = { sizeof(input_base64_ty), input_quoted_printable_destructor, input_quoted_printable_read, input_quoted_printable_ftell, input_quoted_printable_name, input_quoted_printable_length, }; input_ty * input_quoted_printable(deeper, coc) input_ty *deeper; int coc; { input_ty *result; input_base64_ty *this; result = input_new(&vtbl); this = (input_base64_ty *)result; this->deeper = deeper; this->close_on_close = coc; this->eof = 0; this->pos = 0; return result; }