//
// aegis - project change supervisor
// Copyright (C) 2001-2006, 2008 Peter Miller
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see
// <http://www.gnu.org/licenses/>.
//
// From RFC 1521...
//
// The Quoted-Printable encoding is intended to represent data
// that largely consists of octets that correspond to printable
// characters in the ASCII character set. It encodes the data in
// such a way that the resulting octets are unlikely to be modified
// by mail transport. If the data being encoded are mostly ASCII
// text, the encoded form of the data remains largely recognizable
// by humans. A body which is entirely ASCII may also be encoded
// in Quoted-Printable to ensure the integrity of the data should
// the message pass through a character-translating, and/or
// line-wrapping gateway.
//
// In this encoding, octets are to be represented as determined by
// the following rules:
//
// Rule #1: (General 8-bit representation) Any octet, except those
// indicating a line break according to the newline convention
// of the canonical (standard) form of the data being encoded,
// may be represented by an "=" followed by a two digit hexadecimal
// representation of the octet's value. The digits of the hexadecimal
// alphabet, for this purpose, are "0123456789ABCDEF". Uppercase
// letters must be used when sending hexadecimal data, though a
// robust implementation may choose to recognize lowercase letters
// on receipt. Thus, for example, the value 12 (ASCII form feed)
// can be represented by "=0C", and the value 61 (ASCII EQUAL SIGN)
// can be represented by "=3D". Except when the following rules
// allow an alternative encoding, this rule is mandatory.
//
// Rule #2: (Literal representation) Octets with decimal values of
// 33 through 60 inclusive, and 62 through 126, inclusive, MAY be
// represented as the ASCII characters which correspond to those
// octets (EXCLAMATION POINT through LESS THAN, and GREATER THAN
// through TILDE, respectively).
//
// Rule #3: (White Space): Octets with values of 9 and 32 MAY be
// represented as ASCII TAB (HT) and SPACE characters, respectively,
// but MUST NOT be so represented at the end of an encoded line. Any
// TAB (HT) or SPACE characters on an encoded line MUST thus be
// followed on that line by a printable character. In particular,
// an "=" at the end of an encoded line, indicating a soft line
// break (see rule #5) may follow one or more TAB (HT) or SPACE
// characters. It follows that an octet with value 9 or 32 appearing
// at the end of an encoded line must be represented according
// to Rule #1. This rule is necessary because some MTAs (Message
// Transport Agents, programs which transport messages from one
// user to another, or perform a part of such transfers) are known
// to pad lines of text with SPACEs, and others are known to remove
// "white space" characters from the end of a line. Therefore, when
// decoding a Quoted-Printable body, any trailing white space on
// a line must be deleted, as it will necessarily have been added
// by intermediate transport agents.
//
// Rule #4 (Line Breaks): A line break in a text body, independent of
// what its representation is following the canonical representation
// of the data being encoded, must be represented by a (RFC 822)
// line break, which is a CRLF sequence, in the Quoted-Printable
// encoding. Since the canonical representation of types other than
// text do not generally include the representation of line breaks,
// no hard line breaks (i.e. line breaks that are intended to
// be meaningful and to be displayed to the user) should occur
// in the quoted-printable encoding of such types. Of course,
// occurrences of "=0D", "=0A", "=0A=0D" and "=0D=0A" will eventually
// be encountered. In general, however, base64 is preferred over
// quoted-printable for binary data.
//
// Note that many implementations may elect to encode the local
// representation of various content types directly, as described
// in Appendix G. In particular, this may apply to plain text
// material on systems that use newline conventions other than
// CRLF delimiters. Such an implementation is permissible, but the
// generation of line breaks must be generalized to account for
// the case where alternate representations of newline sequences
// are used.
//
// Rule #5 (Soft Line Breaks): The Quoted-Printable encoding REQUIRES
// that encoded lines be no more than 76 characters long. If longer
// lines are to be encoded with the Quoted-Printable encoding,
// 'soft' line breaks must be used. An equal sign as the last
// character on a encoded line indicates such a non-significant
// ('soft') line break in the encoded text. Thus if the "raw"
// form of the line is a single unencoded line that says:
//
// Now's the time for all folk to come to the aid of their country.
//
// This can be represented, in the Quoted-Printable encoding, as
//
// Now's the time =
// for all folk to come =
// to the aid of their country.
//
// This provides a mechanism with which long lines are encoded in
// such a way as to be restored by the user agent. The 76 character
// limit does not count the trailing CRLF, but counts all other
// characters, including any equal signs.
//
// Since the hyphen character ("-") is represented as itself
// in the Quoted-Printable encoding, care must be taken, when
// encapsulating a quoted-printable encoded body in a multipart
// entity, to ensure that the encapsulation boundary does not
// appear anywhere in the encoded body. (A good strategy is to
// choose a boundary that includes a character sequence such as
// "=_" which can never appear in a quoted-printable body. See
// the definition of multipart messages later in this document.)
//
// NOTE: The quoted-printable encoding represents something
// of a compromise between readability and reliability in
// transport. Bodies encoded with the quoted-printable encoding
// will work reliably over most mail gateways, but may not
// work perfectly over a few gateways, notably those involving
// translation into EBCDIC. (In theory, an EBCDIC gateway could
// decode a quoted-printable body and re-encode it using base64,
// but such gateways do not yet exist.) A higher level of confidence
// is offered by the base64 Content-Transfer-Encoding. A way to get
// reasonably reliable transport through EBCDIC gateways is to also
// quote the ASCII characters
//
// !"#$@[\]^`{|}~
//
// according to rule #1. See Appendix B for more information.
//
// Because quoted-printable data is generally assumed to be line-
// oriented, it is to be expected that the representation of
// the breaks between the lines of quoted printable data may
// be altered in transport, in the same manner that plain text
// mail has always been altered in Internet mail when passing
// between systems with differing newline conventions. If such
// alterations are likely to constitute a corruption of the data,
// it is probably more sensible to use the base64 encoding rather
// than the quoted-printable encoding.
//
// WARNING TO IMPLEMENTORS: If binary data are encoded in quoted-
// printable, care must be taken to encode CR and LF characters as
// "=0D" and "=0A", respectively. In particular, a CRLF sequence in
// binary data should be encoded as "=0D=0A". Otherwise, if CRLF
// were represented as a hard line break, it might be incorrectly
// decoded on platforms with different line break conventions.
//
// For formalists, the syntax of quoted-printable data is described
// by the following grammar:
//
// quoted-printable := ([*(ptext / SPACE / TAB) ptext] ["="] CRLF)
// ; Maximum line length of 76 characters excluding CRLF
//
// ptext := octet / <any ASCII character except "=", SPACE, or TAB>
// ; characters not listed as "mail-safe" in Appendix B
// ; are also not recommended.
//
// octet := "=" 2(DIGIT / "A" / "B" / "C" / "D" / "E" / "F")
// ; octet must be used for characters > 127, =, SPACE, or TAB,
// ; and is recommended for any characters not listed in
// ; Appendix B as "mail-safe".
//
#include
#include
#include
//
// Destructor.  The body is empty: nothing is released explicitly here,
// the members are simply destroyed in the usual way.
//
input_quoted_printable::~input_quoted_printable()
{
}
//
// Constructor.  Builds a quoted-printable decoding filter on top of
// another input.
//
// arg: the input the encoded text is read from; it is stored in the
//      deeper member.
//
// The end-of-input flag starts out clear and the decoded position
// starts at zero.
//
input_quoted_printable::input_quoted_printable(input &arg) :
deeper(arg),
eof(false),
pos(0)
{
}
//
// Translate a single hexadecimal digit character into its value.
//
// c: the character to translate; '0'..'9', 'A'..'F' and 'a'..'f' are
//    recognised (both letter cases are accepted).
//
// Returns the digit's value in the range 0..15, or -1 when c is not a
// hexadecimal digit (this includes negative values of c).
//
static int
hex(int c)
{
    if (c >= '0' && c <= '9')
        return (c - '0');
    if (c >= 'A' && c <= 'F')
        return (c - 'A' + 10);
    if (c >= 'a' && c <= 'f')
        return (c - 'a' + 10);
    return -1;
}
//
// Decode quoted-printable text read from the deeper input into the
// caller's buffer.
//
// data: where the decoded bytes are stored.
// len:  the maximum number of bytes to store.
//
// Returns the number of bytes stored (this can be fewer than len), or
// zero once end of input has been seen.  The pos member is advanced
// by the same amount.
//
// The decoding performed here:
//   - a run of spaces and tabs immediately followed by a newline is
//     dropped, only the newline is passed on;
//   - "=" followed by a newline (optionally with spaces or tabs in
//     between) is a soft line break and produces no output;
//   - "=" followed by two hex digits produces the corresponding byte;
//   - every other character is copied through unchanged.
// Anything else following "=" is reported via deeper->fatal_error.
//
long
input_quoted_printable::read_inner(void *data, size_t len)
{
// Once end of input has been recorded, keep returning zero.
if (eof)
return 0;
unsigned char *cp = (unsigned char *)data;
unsigned char *end = cp + len;
// Target of the goto in the white space scanner below; jumping here
// re-enters the loop, which re-tests for room in the output.
next_char:
while (cp < end)
{
int c = deeper->getch();
if (c < 0)
{
// A negative value from getch is treated as end of input.
eof = true;
break;
}
if (c == ' ' || c == '\t')
{
// Scratch buffer holding the current run of white space.  Being
// static, it persists between calls and is not released here.
static char *buf;
static size_t bufmax;
//
// We are supposed to suppress white space on
// the ends of lines. This is because some
// (non-unix, non-windows) mail transfer agents
// add extra white space on the ends of lines.
// (Our corresponding encoding escapes trailing
// spaces and tabs.)
//
size_t bufpos = 0;
for (;;)
{
//
// Stash this character (we may need it later)
//
if (bufpos >= bufmax)
{
// Grow the scratch buffer, keeping what it already holds.
// NOTE(review): on the very first growth buf is still null
// and bufpos is zero, so memcpy is handed a null source with
// a zero length -- confirm this is acceptable.
bufmax = bufmax * 2 + 8;
char *new_buf = new char [bufmax];
memcpy(new_buf, buf, bufpos);
delete [] buf;
buf = new_buf;
}
buf[bufpos++] = c;
//
// See what comes next.
//
c = deeper->getch();
// End of input: leave the scanner; the buffered white space is
// emitted below.  The eof flag is not set on this path.
if (c < 0)
break;
if (c == '\n')
{
// The run was trailing white space: discard it and pass on
// only the newline.
*cp++ = '\n';
goto next_char;
}
if (c != ' ' && c != '\t')
{
// The run was followed by something else, so it is genuine
// data.  Give the following character back for the main
// loop to process.
deeper->ungetc(c);
break;
}
}
//
// Put as many of the buffered characters into
// the output as possible. This means we won't
// double handle them (actually, we would O(n**2)
// handle them).
//
size_t nchars = end - cp;
if (nchars > bufpos)
nchars = bufpos;
memcpy(cp, buf, nchars);
cp += nchars;
//
// If there wasn't room, there is no help for it.
// We will have to give the rest of the buffered
// characters back. Hopefully next time will
// be big enough for all of them. The
// pathological case required >16KB of spaces and
// tabs: unlikely.
//
// The characters are given back last first.
//
while (bufpos > nchars)
{
--bufpos;
deeper->ungetc(buf[bufpos]);
}
//
// Don't fall into the next statement, but start
// this loop from the top. (We could have run
// out of output buf).
//
continue;
}
//
// If this isn't an escape sequence, return the literal
// character.
//
if (c != '=')
{
*cp++ = c;
continue;
}
//
// Grab two hex digits. If they aren't hex digits,
// it is a format error.
//
// Except for trailing white space; that we ignore.
//
c = deeper->getch();
// End of input straight after the '=': stop, producing nothing
// for the '='.
if (c < 0)
break;
if (c == ' ' || c == '\t')
{
// Skip white space between the '=' and the newline.  Anything
// other than white space or the newline (a negative end of input
// value included) is a fatal error.
for (;;)
{
c = deeper->getch();
if (c == '\n')
break;
if (c != ' ' && c != '\t')
{
deeper->fatal_error("quoted printable: invalid character");
// NOTREACHED
}
}
}
// An '=' at the end of the line is a soft line break: produce
// nothing and carry on with the next line.
if (c == '\n')
continue;
// First hex digit.
int n1 = hex(c);
if (n1 < 0)
{
deeper->fatal_error("quoted printable: invalid hex character");
// NOTREACHED
}
// Second hex digit.  hex() returns -1 for a negative value, so
// running out of input here is reported as well.
c = deeper->getch();
int n2 = hex(c);
if (n2 < 0)
{
deeper->fatal_error("quoted printable: invalid hex character");
// NOTREACHED
}
// Combine the two digits, most significant first.
*cp++ = ((n1 << 4) | n2);
}
// Report how much was produced, and account for it in pos.
long nbytes = (cp - (unsigned char *)data);
pos += nbytes;
return nbytes;
}
//
// Report the current position within the decoded data.
//
// Returns the value of the pos member, which read_inner advances by
// the number of decoded bytes it hands back.
//
long
input_quoted_printable::ftell_inner()
{
    const long offset = pos;
    return offset;
}
//
// Report the name of this input.
//
// Returns whatever the deeper input reports as its name; nothing is
// added to it here.
//
nstring
input_quoted_printable::name()
{
return deeper->name();
}
//
// Report the length of this input.
//
// Always returns -1.
// NOTE(review): presumably -1 tells callers that the decoded length is
// not known in advance -- confirm against the input base class.
//
long
input_quoted_printable::length()
{
    const long no_length = -1;
    return no_length;
}
//
// Pass a keepalive request on to the deeper input.  Nothing else is
// done here.
//
void
input_quoted_printable::keepalive()
{
deeper->keepalive();
}
//
// Report whether this input is remote.
//
// The question is delegated to the deeper input and its answer is
// returned unchanged.
//
bool
input_quoted_printable::is_remote() const
{
    const bool remote = deeper->is_remote();
    return remote;
}