/*
 *	aegis - project change supervisor
 *	Copyright (C) 2001 Peter Miller;
 *	All rights reserved.
 *
 *	This program is free software; you can redistribute it and/or modify
 *	it under the terms of the GNU General Public License as published by
 *	the Free Software Foundation; either version 2 of the License, or
 *	(at your option) any later version.
 *
 *	This program is distributed in the hope that it will be useful,
 *	but WITHOUT ANY WARRANTY; without even the implied warranty of
 *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *	GNU General Public License for more details.
 *
 *	You should have received a copy of the GNU General Public License
 *	along with this program; if not, write to the Free Software
 *	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
 *
 * MANIFEST: functions to manipulate quoted_prints
 *
 * From RFC 1521...
 *
 *	The Quoted-Printable encoding is intended to represent data
 *	that largely consists of octets that correspond to printable
 *	characters in the ASCII character set. It encodes the data in
 *	such a way that the resulting octets are unlikely to be modified
 *	by mail transport. If the data being encoded are mostly ASCII
 *	text, the encoded form of the data remains largely recognizable
 *	by humans. A body which is entirely ASCII may also be encoded
 *	in Quoted-Printable to ensure the integrity of the data should
 *	the message pass through a character- translating, and/or
 *	line-wrapping gateway.
 *
 *	In this encoding, octets are to be represented as determined by
 *	the following rules:
 *
 *	Rule #1: (General 8-bit representation) Any octet, except those
 *	indicating a line break according to the newline convention
 *	of the canonical (standard) form of the data being encoded,
 *	may be represented by an "=" followed by a two digit hexadecimal
 *	representation of the octet's value. The digits of the hexadecimal
 *	alphabet, for this purpose, are "0123456789ABCDEF". Uppercase
 *	letters must be used when sending hexadecimal data, though a
 *	robust implementation may choose to recognize lowercase letters
 *	on receipt. Thus, for example, the value 12 (ASCII form feed)
 *	can be represented by "=0C", and the value 61 (ASCII EQUAL SIGN)
 *	can be represented by "=3D". Except when the following rules
 *	allow an alternative encoding, this rule is mandatory.
 *
 *	Rule #2: (Literal representation) Octets with decimal values of
 *	33 through 60 inclusive, and 62 through 126, inclusive, MAY be
 *	represented as the ASCII characters which correspond to those
 *	octets (EXCLAMATION POINT through LESS THAN, and GREATER THAN
 *	through TILDE, respectively).
 *
 *	Rule #3: (White Space): Octets with values of 9 and 32 MAY be
 *	represented as ASCII TAB (HT) and SPACE characters, respectively,
 *	but MUST NOT be so represented at the end of an encoded line. Any
 *	TAB (HT) or SPACE characters on an encoded line MUST thus be
 *	followed on that line by a printable character. In particular,
 *	an "=" at the end of an encoded line, indicating a soft line
 *	break (see rule #5) may follow one or more TAB (HT) or SPACE
 *	characters. It follows that an octet with value 9 or 32 appearing
 *	at the end of an encoded line must be represented according
 *	to Rule #1. This rule is necessary because some MTAs (Message
 *	Transport Agents, programs which transport messages from one
 *	user to another, or perform a part of such transfers) are known
 *	to pad lines of text with SPACEs, and others are known to remove
 *	"white space" characters from the end of a line. Therefore, when
 *	decoding a Quoted-Printable body, any trailing white space on
 *	a line must be deleted, as it will necessarily have been added
 *	by intermediate transport agents.
 *
 *	Rule #4 (Line Breaks): A line break in a text body, independent of
 *	what its representation is following the canonical representation
 *	of the data being encoded, must be represented by a (RFC 822)
 *	line break, which is a CRLF sequence, in the Quoted-Printable
 *	encoding. Since the canonical representation of types other than
 *	text do not generally include the representation of line breaks,
 *	no hard line breaks (i.e.  line breaks that are intended to
 *	be meaningful and to be displayed to the user) should occur
 *	in the quoted-printable encoding of such types. Of course,
 *	occurrences of "=0D", "=0A", "0A=0D" and "=0D=0A" will eventually
 *	be encountered. In general, however, base64 is preferred over
 *	quoted-printable for binary data.
 *
 *	Note that many implementations may elect to encode the local
 *	representation of various content types directly, as described
 *	in Appendix G. In particular, this may apply to plain text
 *	material on systems that use newline conventions other than
 *	CRLF delimiters. Such an implementation is permissible, but the
 *	generation of line breaks must be generalized to account for
 *	the case where alternate representations of newline sequences
 *	are used.
 *
 *	Rule #5 (Soft Line Breaks): The Quoted-Printable encoding REQUIRES
 *	that encoded lines be no more than 76 characters long. If longer
 *	lines are to be encoded with the Quoted-Printable encoding,
 *	'soft' line breaks must be used. An equal sign as the last
 *	character on a encoded line indicates such a non-significant
 *	('soft') line break in the encoded text. Thus if the "raw"
 *	form of the line is a single unencoded line that says:
 *
 *		Now's the time for all folk to come to the aid of their country.
 *
 *	This can be represented, in the Quoted-Printable encoding, as 
 *
 *		Now's the time = 
 *		for all folk to come = 
 *		to the aid of their country. 
 *
 *	This provides a mechanism with which long lines are encoded in
 *	such a way as to be restored by the user agent. The 76 character
 *	limit does not count the trailing CRLF, but counts all other
 *	characters, including any equal signs.
 *
 *	Since the hyphen character ("-") is represented as itself
 *	in the Quoted-Printable encoding, care must be taken, when
 *	encapsulating a quoted-printable encoded body in a multipart
 *	entity, to ensure that the encapsulation boundary does not
 *	appear anywhere in the encoded body. (A good strategy is to
 *	choose a boundary that includes a character sequence such as
 *	"=_" which can never appear in a quoted- printable body. See
 *	the definition of multipart messages later in this document.)
 *
 *	NOTE: The quoted-printable encoding represents something
 *	of a compromise between readability and reliability in
 *	transport. Bodies encoded with the quoted-printable encoding
 *	will work reliably over most mail gateways, but may not
 *	work perfectly over a few gateways, notably those involving
 *	translation into EBCDIC. (In theory, an EBCDIC gateway could
 *	decode a quoted-printable body and re-encode it using base64,
 *	but such gateways do not yet exist.) A higher level of confidence
 *	is offered by the base64 Content-Transfer-Encoding. A way to get
 *	reasonably reliable transport through EBCDIC gateways is to also
 *	quote the ASCII characters
 *
 *		!"#$@[\]^`{|}~ 
 *
 *	according to rule #1. See Appendix B for more information. 
 *
 *	Because quoted-printable data is generally assumed to be line-
 *	oriented, it is to be expected that the representation of
 *	the breaks between the lines of quoted printable data may
 *	be altered in transport, in the same manner that plain text
 *	mail has always been altered in Internet mail when passing
 *	between systems with differing newline conventions. If such
 *	alterations are likely to constitute a corruption of the data,
 *	it is probably more sensible to use the base64 encoding rather
 *	than the quoted-printable encoding.
 *
 *	WARNING TO IMPLEMENTORS: If binary data are encoded in quoted-
 *	printable, care must be taken to encode CR and LF characters as
 *	"=0D" and "=0A", respectively. In particular, a CRLF sequence in
 *	binary data should be encoded as "=0D=0A". Otherwise, if CRLF
 *	were represented as a hard line break, it might be incorrectly
 *	decoded on platforms with different line break conventions.
 *
 *	For formalists, the syntax of quoted-printable data is described
 *	by the following grammar:
 *	
 *	quoted-printable := ([*(ptext / SPACE / TAB) ptext] ["="] CRLF)
 *		; Maximum line length of 76 characters excluding CRLF
 *
 *	ptext := octet /<any ASCII character except "=", SPACE, or TAB>
 *		; characters not listed as "mail-safe" in Appendix B
 *		; are also not recommended.
 *
 *	octet := "=" 2(DIGIT / "A" / "B" / "C" / "D" / "E" / "F")
 *		; octet must be used for characters > 127, =, SPACE, or TAB,
 *		; and is recommended for any characters not listed in
 *		; Appendix B as "mail-safe".
 */

#include <ac/string.h>

#include <input/quoted_print.h>
#include <input/private.h>
#include <mem.h>


/*
 * Instance data for the quoted-printable decoding filter.
 *
 * The tag still says "base64", but every function in this file that
 * uses it belongs to the quoted-printable filter.
 *
 * The functions below cast an input_ty pointer straight to this type,
 * so `inherited' has to stay the first member.
 */
typedef struct input_base64_ty
{
	input_ty	inherited;	/* generic input state; keep first */
	input_ty	*deeper;	/* stream the encoded text is read from */
	int		close_on_close;	/* non-zero: destructor deletes deeper */
	int		eof;		/* set once deeper has reported end of input */
	long		pos;		/* count of decoded bytes returned so far */
} input_base64_ty;


static void input_quoted_printable_destructor _((input_ty *));

/*
 * Destructor method: release what the filter holds.
 *
 * The deeper stream is deleted only when the filter was created with
 * the close-on-close flag set; otherwise it is left alone.  Either
 * way the pointer to it is cleared afterwards.
 */
static void
input_quoted_printable_destructor(fp)
	input_ty	*fp;
{
	input_base64_ty	*qp = (input_base64_ty *)fp;

	if (qp->close_on_close)
	{
		input_delete(qp->deeper);
	}

	/* Defensive: make sure a stale pointer is never left behind. */
	qp->deeper = 0;
}


static int hex _((int));

/*
 * Convert one hexadecimal digit character to its numeric value.
 *
 * Accepts '0'..'9', 'A'..'F' and 'a'..'f' and returns 0..15.
 * Anything else (including a negative end-of-input indication)
 * yields -1.
 */
static int
hex(c)
	int	c;
{
	if (c >= '0' && c <= '9')
		return (c - '0');
	if (c >= 'A' && c <= 'F')
		return (c - 'A' + 10);
	if (c >= 'a' && c <= 'f')
		return (c - 'a' + 10);
	return -1;
}


static long input_quoted_printable_read _((input_ty *, void *, size_t));

/*
 * Read method: decode quoted-printable text from the deeper stream.
 *
 * Up to `len' decoded bytes are stored at `data'.  The return value
 * is the number of bytes stored, which is also added to the running
 * position.  Once end of input has been seen on the deeper stream,
 * every later call returns 0 without touching the deeper stream.
 *
 * Decoding rules implemented here:
 *   - blanks (space, tab) directly before a newline are dropped;
 *   - "=" followed by optional blanks and a newline is a soft line
 *     break and produces no output;
 *   - "=" followed by two hex digits produces the byte they encode;
 *   - every other character is copied through unchanged.
 *
 * Malformed escapes are reported through input_fatal_error().
 */
static long
input_quoted_printable_read(fp, data, len)
	input_ty	*fp;
	void		*data;
	size_t		len;
{
	input_base64_ty	*this;
	unsigned char	*cp;
	unsigned char	*end;
	long		nbytes;

	this = (input_base64_ty *)fp;
	if (this->eof)
		return 0;
	cp = data;
	end = cp + len;
	while (cp < end)
	{
		int	c;
		int	n1, n2;

		c = input_getc(this->deeper);
		if (c < 0)
		{
			this->eof = 1;
			break;
		}
		if (c == ' ' || c == '\t')
		{
			/*
			 * The scratch buffer is static so that it is
			 * allocated once and re-used by later calls.
			 */
			static char	*buffer;
			static size_t	bufmax;
			size_t		bufpos;
			size_t		nchars;

			/*
			 * Blanks at the end of a line must be suppressed,
			 * so collect the whole run of blanks before
			 * deciding whether any of them is to be returned.
			 */
			bufpos = 0;
			for (;;)
			{
				/*
				 * Remember this blank; it is needed if the
				 * run turns out not to end the line.
				 */
				if (bufpos >= bufmax)
				{
					bufmax = bufmax * 2 + 8;
					buffer =
						mem_change_size(buffer, bufmax);
				}
				buffer[bufpos++] = c;

				/*
				 * Look at the character after it.
				 */
				c = input_getc(this->deeper);
				if (c < 0)
					break;
				if (c == '\n')
				{
					/*
					 * Trailing blanks: discard the
					 * whole run and return only the
					 * newline.  There is room for it,
					 * because the loop condition
					 * guarantees cp < end.
					 */
					*cp++ = '\n';
					goto next_char;
				}
				if (c != ' ' && c != '\t')
				{
					/*
					 * The run is followed by real
					 * text; give that character back
					 * so the main loop decodes it.
					 */
					input_ungetc(this->deeper, c);
					break;
				}
			}

			/*
			 * Copy as many of the collected blanks as fit
			 * into the caller's buffer.
			 */
			nchars = end - cp;
			if (nchars > bufpos)
				nchars = bufpos;
			memcpy(cp, buffer, nchars);
			cp += nchars;

			/*
			 * Whatever did not fit is pushed back, last
			 * blank first, so that the next call reads the
			 * remainder in the original order.  This relies
			 * on input_ungetc accepting several pushed-back
			 * characters.
			 */
			while (bufpos > nchars)
			{
				--bufpos;
				input_ungetc(this->deeper, buffer[bufpos]);
			}

			/*
			 * Back to the top of the loop: the output
			 * buffer may be full by now.
			 */
			continue;
		}

		/*
		 * Anything other than '=' stands for itself.
		 */
		if (c != '=')
		{
			*cp++ = c;
			continue;
		}

		/*
		 * An escape: either a soft line break or two hex
		 * digits.
		 */
		c = input_getc(this->deeper);
		if (c < 0)
		{
			/*
			 * The input stops right after the '='.  Remember
			 * that end of input was reached, as is done for
			 * the ordinary end-of-input case, so that later
			 * calls do not read the exhausted stream again.
			 */
			this->eof = 1;
			break;
		}
		if (c == ' ' || c == '\t')
		{
			/*
			 * Blanks between the '=' and the newline are
			 * ignored; anything else after them is an error.
			 */
			for (;;)
			{
				c = input_getc(this->deeper);
				if (c == '\n')
					break;
				if (c != ' ' && c != '\t')
				{
					input_fatal_error(this->deeper, "quoted printable: invalid character");
					/* NOTREACHED */
				}
			}
		}
		if (c == '\n')
		{
			/* soft line break: produces no output */
			continue;
		}
		n1 = hex(c);
		if (n1 < 0)
		{
			input_fatal_error(fp, "quoted printable: invalid hex character");
			/* NOTREACHED */
		}
		c = input_getc(this->deeper);
		n2 = hex(c);
		if (n2 < 0)
		{
			/* also reached when the input ends after one digit */
			input_fatal_error(fp, "quoted printable: invalid hex character");
			/* NOTREACHED */
		}
		*cp++ = ((n1 << 4) | n2);
	next_char:
		;
	}
	nbytes = (cp - (unsigned char *)data);
	this->pos += nbytes;
	return nbytes;
}


static long input_quoted_printable_ftell _((input_ty *));

/*
 * Tell method: report the current position, which is the total
 * number of decoded bytes the read method has returned so far.
 */
static long
input_quoted_printable_ftell(fp)
	input_ty	*fp;
{
	return ((input_base64_ty *)fp)->pos;
}


static struct string_ty *input_quoted_printable_name _((input_ty *));

/*
 * Name method: the filter has no name of its own, so it reports
 * the name of the stream it is reading from.
 */
static struct string_ty *
input_quoted_printable_name(fp)
	input_ty	*fp;
{
	input_ty	*source;

	source = ((input_base64_ty *)fp)->deeper;
	return input_name(source);
}


static long input_quoted_printable_length _((input_ty *));

/*
 * Length method: always answers -1, whatever the stream.
 * The argument is not examined.
 */
static long
input_quoted_printable_length(fp)
	input_ty	*fp;
{
	long		answer;

	answer = -1;
	return answer;
}


/*
 * Method table for the quoted-printable input filter; it is handed
 * to input_new() by input_quoted_printable().
 *
 * The initializer is positional, so the entries must stay in the
 * member order of input_vtbl_ty (which is not declared in this file).
 */
static input_vtbl_ty vtbl =
{
	/* presumably the instance size input_new() allocates -- confirm */
	sizeof(input_base64_ty),
	input_quoted_printable_destructor,
	input_quoted_printable_read,
	input_quoted_printable_ftell,
	input_quoted_printable_name,
	input_quoted_printable_length,
};


/*
 * Create an input stream which decodes quoted-printable text.
 *
 * deeper	the stream supplying the encoded text
 * coc		"close on close": when non-zero, the deeper stream is
 *		deleted by this filter's destructor
 *
 * Returns the new stream, positioned at 0 and not yet at end of
 * input.
 */
input_ty *
input_quoted_printable(deeper, coc)
	input_ty	*deeper;
	int		coc;
{
	input_ty	*fp;
	input_base64_ty	*qp;

	fp = input_new(&vtbl);
	qp = (input_base64_ty *)fp;
	qp->pos = 0;
	qp->eof = 0;
	qp->close_on_close = coc;
	qp->deeper = deeper;
	return fp;
}