//
// aegis - project change supervisor
// Copyright (C) 2001-2006, 2008 Peter Miller
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see
// <http://www.gnu.org/licenses/>.
//
// From RFC 1521...
//
// The Quoted-Printable encoding is intended to represent data
// that largely consists of octets that correspond to printable
// characters in the ASCII character set. It encodes the data in
// such a way that the resulting octets are unlikely to be modified
// by mail transport. If the data being encoded are mostly ASCII
// text, the encoded form of the data remains largely recognizable
// by humans. A body which is entirely ASCII may also be encoded
// in Quoted-Printable to ensure the integrity of the data should
// the message pass through a character-translating, and/or
// line-wrapping gateway.
//
// In this encoding, octets are to be represented as determined by
// the following rules:
//
// Rule #1: (General 8-bit representation) Any octet, except those
// indicating a line break according to the newline convention
// of the canonical (standard) form of the data being encoded,
// may be represented by an "=" followed by a two digit hexadecimal
// representation of the octet's value. The digits of the hexadecimal
// alphabet, for this purpose, are "0123456789ABCDEF". Uppercase
// letters must be used when sending hexadecimal data, though a
// robust implementation may choose to recognize lowercase letters
// on receipt.
// Thus, for example, the value 12 (ASCII form feed)
// can be represented by "=0C", and the value 61 (ASCII EQUAL SIGN)
// can be represented by "=3D". Except when the following rules
// allow an alternative encoding, this rule is mandatory.
//
// Rule #2: (Literal representation) Octets with decimal values of
// 33 through 60 inclusive, and 62 through 126, inclusive, MAY be
// represented as the ASCII characters which correspond to those
// octets (EXCLAMATION POINT through LESS THAN, and GREATER THAN
// through TILDE, respectively).
//
// Rule #3: (White Space): Octets with values of 9 and 32 MAY be
// represented as ASCII TAB (HT) and SPACE characters, respectively,
// but MUST NOT be so represented at the end of an encoded line. Any
// TAB (HT) or SPACE characters on an encoded line MUST thus be
// followed on that line by a printable character. In particular,
// an "=" at the end of an encoded line, indicating a soft line
// break (see rule #5) may follow one or more TAB (HT) or SPACE
// characters. It follows that an octet with value 9 or 32 appearing
// at the end of an encoded line must be represented according
// to Rule #1. This rule is necessary because some MTAs (Message
// Transport Agents, programs which transport messages from one
// user to another, or perform a part of such transfers) are known
// to pad lines of text with SPACEs, and others are known to remove
// "white space" characters from the end of a line. Therefore, when
// decoding a Quoted-Printable body, any trailing white space on
// a line must be deleted, as it will necessarily have been added
// by intermediate transport agents.
//
// Rule #4 (Line Breaks): A line break in a text body, independent of
// what its representation is following the canonical representation
// of the data being encoded, must be represented by a (RFC 822)
// line break, which is a CRLF sequence, in the Quoted-Printable
// encoding.
// Since the canonical representation of types other than
// text do not generally include the representation of line breaks,
// no hard line breaks (i.e. line breaks that are intended to
// be meaningful and to be displayed to the user) should occur
// in the quoted-printable encoding of such types. Of course,
// occurrences of "=0D", "=0A", "0A=0D" and "=0D=0A" will eventually
// be encountered. In general, however, base64 is preferred over
// quoted-printable for binary data.
//
// Note that many implementations may elect to encode the local
// representation of various content types directly, as described
// in Appendix G. In particular, this may apply to plain text
// material on systems that use newline conventions other than
// CRLF delimiters. Such an implementation is permissible, but the
// generation of line breaks must be generalized to account for
// the case where alternate representations of newline sequences
// are used.
//
// Rule #5 (Soft Line Breaks): The Quoted-Printable encoding REQUIRES
// that encoded lines be no more than 76 characters long. If longer
// lines are to be encoded with the Quoted-Printable encoding,
// 'soft' line breaks must be used. An equal sign as the last
// character on a encoded line indicates such a non-significant
// ('soft') line break in the encoded text. Thus if the "raw"
// form of the line is a single unencoded line that says:
//
//     Now's the time for all folk to come to the aid of their country.
//
// This can be represented, in the Quoted-Printable encoding, as
//
//     Now's the time =
//     for all folk to come =
//     to the aid of their country.
//
// This provides a mechanism with which long lines are encoded in
// such a way as to be restored by the user agent. The 76 character
// limit does not count the trailing CRLF, but counts all other
// characters, including any equal signs.
// // Since the hyphen character ("-") is represented as itself // in the Quoted-Printable encoding, care must be taken, when // encapsulating a quoted-printable encoded body in a multipart // entity, to ensure that the encapsulation boundary does not // appear anywhere in the encoded body. (A good strategy is to // choose a boundary that includes a character sequence such as // "=_" which can never appear in a quoted- printable body. See // the definition of multipart messages later in this document.) // // NOTE: The quoted-printable encoding represents something // of a compromise between readability and reliability in // transport. Bodies encoded with the quoted-printable encoding // will work reliably over most mail gateways, but may not // work perfectly over a few gateways, notably those involving // translation into EBCDIC. (In theory, an EBCDIC gateway could // decode a quoted-printable body and re-encode it using base64, // but such gateways do not yet exist.) A higher level of confidence // is offered by the base64 Content-Transfer-Encoding. A way to get // reasonably reliable transport through EBCDIC gateways is to also // quote the ASCII characters // // !"#$@[\]^`{|}~ // // according to rule #1. See Appendix B for more information. // // Because quoted-printable data is generally assumed to be line- // oriented, it is to be expected that the representation of // the breaks between the lines of quoted printable data may // be altered in transport, in the same manner that plain text // mail has always been altered in Internet mail when passing // between systems with differing newline conventions. If such // alterations are likely to constitute a corruption of the data, // it is probably more sensible to use the base64 encoding rather // than the quoted-printable encoding. // // WARNING TO IMPLEMENTORS: If binary data are encoded in quoted- // printable, care must be taken to encode CR and LF characters as // "=0D" and "=0A", respectively. 
In particular, a CRLF sequence in // binary data should be encoded as "=0D=0A". Otherwise, if CRLF // were represented as a hard line break, it might be incorrectly // decoded on platforms with different line break conventions. // // For formalists, the syntax of quoted-printable data is described // by the following grammar: // // quoted-printable := ([*(ptext / SPACE / TAB) ptext] ["="] CRLF) // ; Maximum line length of 76 characters excluding CRLF // // ptext := octet / 127, =, SPACE, or TAB, // ; and is recommended for any characters not listed in // ; Appendix B as "mail-safe". // #include #include #include input_quoted_printable::~input_quoted_printable() { } input_quoted_printable::input_quoted_printable(input &arg) : deeper(arg), eof(false), pos(0) { } static int hex(int c) { switch (c) { default: return -1; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return (c - '0'); case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': return (c - 'A' + 10); case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': return (c - 'a' + 10); } } long input_quoted_printable::read_inner(void *data, size_t len) { if (eof) return 0; unsigned char *cp = (unsigned char *)data; unsigned char *end = cp + len; next_char: while (cp < end) { int c = deeper->getch(); if (c < 0) { eof = true; break; } if (c == ' ' || c == '\t') { static char *buf; static size_t bufmax; // // We are supposed to suppress white space on // the ends of lines. This is because some // (non-unix, non-windows) mail transfer agents // add extra white space on the ends of lines. // (Our corresponding encoding escapes trailing // spaces and tabs.) // size_t bufpos = 0; for (;;) { // // Stash this character (we may need it later) // if (bufpos >= bufmax) { bufmax = bufmax * 2 + 8; char *new_buf = new char [bufmax]; memcpy(new_buf, buf, bufpos); delete [] buf; buf = new_buf; } buf[bufpos++] = c; // // See what comes next. 
// c = deeper->getch(); if (c < 0) break; if (c == '\n') { *cp++ = '\n'; goto next_char; } if (c != ' ' && c != '\t') { deeper->ungetc(c); break; } } // // Put as many of the buffered characters into // the output as possible. This means we won't // double handle them (actually, we would O(n**2) // handle them). // size_t nchars = end - cp; if (nchars > bufpos) nchars = bufpos; memcpy(cp, buf, nchars); cp += nchars; // // If there wasn't room, there is no help for it. // We will have to give the rest of the buffered // characters back. Hopefully next time will // be big enough for all of them. The // pathological case required >16KB of spaces and // tabs: unlikely. // while (bufpos > nchars) { --bufpos; deeper->ungetc(buf[bufpos]); } // // Don't fall into the next statement, but start // this loop from the top. (We could have run // out of output buf). // continue; } // // If this isn't an escape sequence, return the literal // character. // if (c != '=') { *cp++ = c; continue; } // // Grab two hex digits. If they aren't hex digits, // it is a format error. // // Except for trailing white space; that we ignore. 
// c = deeper->getch(); if (c < 0) break; if (c == ' ' || c == '\t') { for (;;) { c = deeper->getch(); if (c == '\n') break; if (c != ' ' && c != '\t') { deeper->fatal_error("quoted printable: invalid character"); // NOTREACHED } } } if (c == '\n') continue; int n1 = hex(c); if (n1 < 0) { deeper->fatal_error("quoted printable: invalid hex character"); // NOTREACHED } c = deeper->getch(); int n2 = hex(c); if (n2 < 0) { deeper->fatal_error("quoted printable: invalid hex character"); // NOTREACHED } *cp++ = ((n1 << 4) | n2); } long nbytes = (cp - (unsigned char *)data); pos += nbytes; return nbytes; } long input_quoted_printable::ftell_inner() { return pos; } nstring input_quoted_printable::name() { return deeper->name(); } long input_quoted_printable::length() { return -1; } void input_quoted_printable::keepalive() { deeper->keepalive(); } bool input_quoted_printable::is_remote() const { return deeper->is_remote(); }