/* -*-C-*-
*******************************************************************************
*
* File:         iso_convert.c
* RCS:          iso_convert.c,v 1.8 1998/11/13 01:22:28 tom Exp
* Description:  some iso header conversions.
* Author:       Tom Hageman <tom@basil.icce.rug.nl>
* Created:      Tue Dec 16 18:15:26 1997 (extracted/expanded from appnmail)
* Modified:     
* Language:     C
* Package:      mailapp-utilities
* Status:       Exp.
*
* (C) Copyright 1997, but otherwise this file is perfect freeware.
*
*******************************************************************************
*/

#import <stdio.h>
#import <string.h>
#import <ctype.h>
#import "optutil.h"
#import "iso_convert.h"
#import "re.h"

#import "iso2next.h" /* for NEXTSTEP / OPENSTEP */
#import "iso2mac.h"  /* for Rhapsody(DR2) / MacOS X */

/* Tables in there are expected to be in the "recode --header" format. */
/* XXX maybe we'd better use "recode --header --strict"? */

/* Conversion table for `usascii' charset.
   Generated mechanically by GNU recode 3.4.

   WARNING: Hand-hacked to make it strict.

   Indexed by input octet.  A zero entry means "no mapping"; convert_table()
   and convert_utf8() replace such octets with `?' and count an error.  Only
   the first 128 entries are spelled out; entries 128-255 are implicitly
   zero, so every 8-bit octet is rejected.

   NOTE(review): octets 94 (`^') and 96 (``') are remapped to 195 and 193 --
   presumably the circumflex and grave accent positions of the NeXT
   encoding; confirm.  iso_convert() registers this same table for the
   "mac" output encoding as well, so verify that the remap is wanted there.
*/
static unsigned char const usascii[256] =
  {
      0,   1,   2,   3,   4,   5,   6,   7,	/*   0 -   7 */
      8,   9,  10,  11,  12,  13,  14,  15,	/*   8 -  15 */
     16,  17,  18,  19,  20,  21,  22,  23,	/*  16 -  23 */
     24,  25,  26,  27,  28,  29,  30,  31,	/*  24 -  31 */
     32,  33,  34,  35,  36,  37,  38,  39,	/*  32 -  39 */
     40,  41,  42,  43,  44,  45,  46,  47,	/*  40 -  47 */
     48,  49,  50,  51,  52,  53,  54,  55,	/*  48 -  55 */
     56,  57,  58,  59,  60,  61,  62,  63,	/*  56 -  63 */
     64,  65,  66,  67,  68,  69,  70,  71,	/*  64 -  71 */
     72,  73,  74,  75,  76,  77,  78,  79,	/*  72 -  79 */
     80,  81,  82,  83,  84,  85,  86,  87,	/*  80 -  87 */
     88,  89,  90,  91,  92,  93, 195,  95,	/*  88 -  95 */
    193,  97,  98,  99, 100, 101, 102, 103,	/*  96 - 103 */
    104, 105, 106, 107, 108, 109, 110, 111,	/* 104 - 111 */
    112, 113, 114, 115, 116, 117, 118, 119,	/* 112 - 119 */
    120, 121, 122, 123, 124, 125, 126, 127,	/* 120 - 127 */
    /* Entries 128 - 255 are left implicitly zero (unmapped). */
  };


/* Return the value (0-15) of hexadecimal digit `ch', or -1 if it is not one. */
static int qp_hex_value(char ch)
{
   if (ch >= '0' && ch <= '9') return ch - '0';
   if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10;
   if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10;
   return -1;
}

/*
 * Decode `len' octets of "Q"-encoded (quoted-printable) header text.
 *
 *   buf  - receives the decoded octets; it is NOT null-terminated.  The
 *          output never outgrows the input, and every input octet is read
 *          before the output position reaches it, so `buf' may equal `str'
 *          (or lie before it in the same buffer).
 *   str  - encoded text; only the first `len' octets are examined.
 *   len  - number of octets in `str'.
 *   err  - if non-NULL, incremented once for every malformed `=' escape.
 *
 * "=XX" (two hexadecimal digits, either case) yields the octet 0xXX and
 * `_' yields a space [rfc2047 4.2(2)].  An `=' that is not followed by two
 * hexadecimal digits inside the input counts as an error and is copied
 * through literally.  Everything else is copied unchanged.
 *
 * Returns the number of octets stored in `buf'.
 */
size_t decode_quoted_printable(char *buf, const char *str, size_t len, int *err)
{
   char *d = buf;
   const char *s = str, *e = s + len;
   int errors = 0;

   while (s < e)
   {
      if (*s == '=')
      {
         int hi = -1, lo = -1;

         /* Both digits must lie inside the input; never look beyond `e'. */
         if ((e - s) >= 3)
         {
            hi = qp_hex_value(s[1]);
            lo = qp_hex_value(s[2]);
         }
         if (hi >= 0 && lo >= 0)
         {
            s += 3;
            *d++ = (char)((hi << 4) | lo);
            continue;
         }
         /* Malformed escape: flag it, then copy the `=' literally below. */
         ++errors;
      }
      else if (*s == '_')  /* special-case for Q-P headers [rfc2047 4.2(2)] */
      {
         s += 1;
         *d++ = '\x20';
         continue;
      }
      *d++ = *s++;
   }
   if (err) (*err) += errors;
   return (size_t)(d - buf);
}

/*
 * Decode `len' octets of base64 ("B"-encoded) text.
 *
 *   buf  - receives the decoded octets; it is NOT null-terminated.  Three
 *          octets are written only after four input characters have been
 *          read, so `buf' may equal `str' (or lie before it in the same
 *          buffer).
 *   str  - encoded text; only the first `len' octets are examined.
 *   len  - number of octets in `str'.
 *   err  - if non-NULL, incremented once for every character outside the
 *          base64 alphabet, and once more if a single dangling character
 *          (which cannot form an octet) is left over.
 *
 * Trailing `=' padding is ignored.  Characters outside the alphabet are
 * skipped.  A final group of two characters yields one octet and a final
 * group of three yields two octets; the unused low-order bits are dropped.
 *
 * Returns the number of octets stored in `buf'.
 */
size_t decode_base64(char *buf, const char *str, size_t len, int *err)
{
   /* Sextet value of each octet, or -1 if it is not in the alphabet.
      Explicitly signed: plain `char' may be unsigned, which would make the
      -1 marker undetectable.  Filled in lazily on the first call. */
   static signed char b64[256];
   static int initialized = 0;
   char *d = buf;
   const char *s = str, *e = s + len;
   unsigned char b[4] = { 0, 0, 0, 0 };
   int errors = 0;
   int i;

   if (!initialized)
   {
      static const unsigned char c64[64] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

      memset(b64, -1, sizeof(b64));
      for (i = 0;  i < 64;  i++) b64[c64[i]] = (signed char)i;
      initialized = 1;
   }

   /* Ignore `=' padding at the end. */
   while (s < e && e[-1] == '=') --e;

   i = 0;
   while (s < e)
   {
      int v = b64[(unsigned char)*s++];

      if (v < 0)
      {
         /* Not a base64 character: flag it and leave it out of the group. */
         ++errors;
         continue;
      }
      b[i++] = (unsigned char)v;

      if (i == 4)
      {
         *d++ = (char)(((b[0] << 2) | (b[1] >> 4)) & 0xFF);
         *d++ = (char)(((b[1] << 4) | (b[2] >> 2)) & 0xFF);
         *d++ = (char)(((b[2] << 6) | (b[3])) & 0xFF);
         i = 0;
      }
   }

   /* Flush the final, incomplete group (n sextets carry n-1 whole octets). */
   switch (i)
   {
   case 1:
      /* Six bits cannot make an octet: the input is truncated. */
      ++errors;
      break;
   case 2:
      *d++ = (char)(((b[0] << 2) | (b[1] >> 4)) & 0xFF);
      break;
   case 3:
      *d++ = (char)(((b[0] << 2) | (b[1] >> 4)) & 0xFF);
      *d++ = (char)(((b[1] << 4) | (b[2] >> 2)) & 0xFF);
      break;
   default:
      break;
   }
   if (err) (*err) += errors;
   return (size_t)(d - buf);
}

/*
 * Translate `len' octets of `str' through a 256-entry lookup table.
 *
 *   buf   - receives one output octet per input octet; it is not
 *           null-terminated.  May be the same buffer as `str'.
 *   str   - input octets.
 *   len   - number of input octets; null octets at the very end are
 *           dropped before translating.
 *   err   - if non-NULL, incremented once per untranslatable octet.
 *   table - maps each input octet to its replacement; an entry of zero
 *           means "no mapping" and is emitted as `?'.
 *
 * Returns the number of octets stored in `buf'.
 */
static size_t convert_table(char *buf, const char *str, size_t len, int *err, const unsigned char *table)
{
   const unsigned char *src = (const unsigned char *)str;
   size_t count = len;
   size_t pos;
   int failed = 0;

   /* Trim the run of null octets, if any, that ends the input. */
   while (count > 0 && src[count - 1] == '\0')
   {
      count--;
   }

   for (pos = 0;  pos < count;  pos++)
   {
      unsigned char mapped = table[src[pos]];

      if (mapped == '\0')
      {
         /* Never store a null character in the result. */
         failed++;
         mapped = '?';
      }
      buf[pos] = (char)mapped;
   }

   if (err)
   {
      *err += failed;
   }
   return count;
}


/* Shift states toggled by the escape sequences recognised below. */
#define KANJI_IN	0
#define KANJI_OUT	(!(KANJI_IN))

#define ESC '\033'

/*
 * Strip the escape sequences from `len' octets of `str' and set the high
 * bit on every octet found while in the "kanji in" state.
 *
 *   buf   - receives the result; it is not null-terminated.  The output
 *           never outgrows the input, so `buf' may equal `str'.
 *   str   - input octets.
 *   len   - number of input octets; null octets at the very end are
 *           dropped first.
 *   err   - unused (no errors are reported by this converter).
 *   table - unused; present so the signature matches convert_function.
 *
 * ESC "$B" and ESC "$@" switch to "kanji in"; ESC "(B" and ESC "(J" switch
 * back to "kanji out", which is also the initial state.  An ESC that does
 * not start one of these sequences within the input is dropped on its own.
 * Only the `len' octets handed in are examined: a sequence cut off by the
 * end of the input is not completed from whatever follows in memory.
 *
 * Returns the number of octets stored in `buf'.
 */
static size_t convert_jis2euc(char *buf, const char *str, size_t len, int *err, const unsigned char *table)
{
   static const struct
   {
      const char *seq;
      int flag;
      int len;
   }
   kanji[] = {
      { "$B", KANJI_IN, 2 },
      { "$@", KANJI_IN, 2 },
      { "(B", KANJI_OUT, 2 },
      { "(J", KANJI_OUT, 2 },
   };
   const size_t kanji_count = sizeof(kanji) / sizeof(kanji[0]);
   int kanji_flag = KANJI_OUT;

   const char *s = str, *e = s + len;
   char *d = buf;
   size_t i;

   (void)err;
   (void)table;

   /* Ignore null-characters at end. */
   while (s < e && e[-1] == '\0') --e;

   for ( ;  s < e;  s++)
   {
      if (*s != ESC)
      {
         *d++ = (kanji_flag == KANJI_IN) ? (*s | 0x80) : *s;
         continue;
      }

      for (i = 0;  i < kanji_count;  i++)
      {
         size_t seqlen = (size_t)kanji[i].len;

         /* The whole sequence must fit in what is left after the ESC. */
         if ((size_t)(e - s - 1) >= seqlen &&
             memcmp(s + 1, kanji[i].seq, seqlen) == 0)
         {
            kanji_flag = kanji[i].flag;
            s += seqlen;	/* the loop increment skips the ESC itself */
            break;
         }
      }
   }
   return (size_t)(d - buf);
}

/*
 * Decode `len' octets of UTF-8 from `str' and translate each resulting code
 * point through `table'.
 *
 *   buf   - receives one octet per decoded character; it is not
 *           null-terminated.  The output never outgrows the input, so
 *           `buf' may equal `str'.
 *   str   - UTF-8 input.
 *   len   - number of input octets; null octets at the very end are
 *           dropped first.
 *   err   - if non-NULL, incremented once per character that could not be
 *           decoded or translated.
 *   table - 256-entry table indexed by code point (0-255); a zero entry
 *           means "no mapping".
 *
 * Each of the following is emitted as a single `?' and counted as an error:
 * a code point whose table entry is zero, a code point of 0x100 or above
 * (also logged), an invalid initial octet, a sequence truncated by the end
 * of the input, and a sequence interrupted by a non-continuation octet (the
 * interrupting octet is then decoded on its own).
 *
 * Returns the number of octets stored in `buf'.
 */
static size_t convert_utf8(char *buf, const char *str, size_t len, int *err, const unsigned char *table)
{
   char *d = buf;
   const unsigned char *s = (const unsigned char *)str, *e = s + len;
   int errors = 0;

   /* Ignore null-characters at end. */
   while (s < e && e[-1] == '\0') --e;

   while (s < e)
   {
      unsigned char c = *s++;

      if (c < 0x80)
      {
         /* Simple case. */
         if ((*d++ = table[c]) == '\0')
         {
            ++errors;
            d[-1] = '?';
         }
      }
      else
      {
         /* Convert multi-byte UTF8 to UCS4. */
         /* limits[k] is the smallest initial octet that announces more than
            k continuation octets. */
         static const unsigned char limits[] = {
            0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE
         };
         const unsigned limitsCount = sizeof(limits) / sizeof(limits[0]);
         unsigned long code;
         unsigned i = 0, j;

         /* i = number of continuation octets announced by `c'. */
         while ((c >= limits[i]) && (++i < limitsCount)) continue;

         if ((i == 0) || (i == limitsCount) /* invalid initial octet value */ ||
             ((size_t)(e - s) < i))        /* truncated sequence */
         {
            /* Skip any trailing [10xxxxxx] octets.  Test the bit pattern
               rather than subtracting: after integer promotion a
               subtraction goes negative for ASCII octets and would swallow
               them as well. */
            while (s < e && (*s & 0xC0) == 0x80) ++s;
            ++errors;
            *d++ = '?';
            continue;
         }

         /* Payload bits of the initial octet. */
         code = (unsigned long)(c - limits[i-1]);

         for (j = 0;  j < i;  j++)
         {
            c = *s;
            if ((c & 0xC0) != 0x80)
            {
               /* Not a continuation octet: leave it for the next round. */
               ++errors;
               *d++ = '?';
               break;
            }
            ++s;
            code = (code << 6) | (unsigned long)(c & 0x3F);
         }
         if (j < i) continue; /* error handling. */

         /* We can only handle 8-bit portion of UCS4, for now. */
         if (code < 0x100)
         {
            /* Look up the decoded code point, not the last octet read. */
            if ((*d++ = table[code]) == '\0')
            {
               ++errors;
               d[-1] = '?';
            }
         }
         else
         {
            logprintf("(convert_utf8: cannot handle code `%#lx' yet)", code);
            ++errors;
            *d++ = '?';
         }
      }
   }
   if (err) (*err) += errors;
   return (size_t)(d - buf);
}

/* Signature of the transfer-encoding decoders that iso_convert() selects by
   tag letter (decode_quoted_printable, decode_base64). */
typedef size_t (*decode_function)(char *buf, const char *str, size_t len, int *err);
/* Signature of the charset converters that iso_convert() selects by charset
   name (convert_table, convert_jis2euc, convert_utf8). */
typedef size_t (*convert_function)(char *buf, const char *str, size_t len, int *err, const unsigned char *table);


/*
 * Decode the MIME `encoded-words' ("=?charset?X?text?=") found in `line' and
 * convert the whole line to the output character set, in place.
 *
 *   line            - null-terminated header text; rewritten in place.  The
 *                     converters used here never produce more octets than
 *                     they consume, so the result fits in the original
 *                     buffer.
 *   input_encoding  - charset assumed for text outside encoded-words; NULL
 *                     or an unsupported name selects DEF_INPUT_ENCODING.
 *   output_encoding - "next" or "mac" (case-insensitive); NULL or an
 *                     unsupported name selects DEF_OUTPUT_ENCODING.
 *
 * Encoded-words with an unsupported charset or transfer-encoding are left
 * in the line undecoded and a warning is logged.  Whitespace that is the
 * only text in front of an encoded-word is dropped [RFC2047 6.2].
 *
 * Returns 0 on success, or -1 if any decoder or converter reported an
 * error (the line is still converted as far as possible).
 *
 * NOTE(review): assumes `isore->start' / `isore->end' delimit the text
 * matched by re_match() and `braslist' / `braelist' the start / end of the
 * \( \) groups, and that `?' is a literal in this regex dialect -- confirm
 * against "re.h".
 */
int iso_convert(char *line, const char *input_encoding, const char *output_encoding)
{
   /* XXX TODO: for OPENSTEP/MacOSXS, should use innate NSString/NSData
      conversion capabilities. */
   static const struct convert_table
   {
      const char *name;
      const unsigned char *contents;
      convert_function convert;
   }
   to_next_table[] = {
      /* The contents of these tables are defined in "iso2next.h".
         If you add one there, don't forget to add it here too. */
      { "iso-8859-1",   latin1_to_next, convert_table   },
      { "iso-8859-2",   latin2_to_next, convert_table   },
      /* convert_jis2euc ignores its table; the entry only has to be
         non-NULL so the charset counts as supported. */
      { "iso-2022-jp",  (const unsigned char *)"", convert_jis2euc },
      { "us-ascii",     usascii,        convert_table   },
      { "utf-8",        latin1_to_next, convert_utf8    }
   },
   to_mac_table[] = {
      /* The contents of these tables are defined in "iso2mac.h".
         If you add one there, don't forget to add it here too. */
      { "iso-8859-1",   latin1_to_mac,  convert_table   },
      { "iso-8859-2",   latin2_to_mac,  convert_table   },
      { "iso-2022-jp",  (const unsigned char *)"", convert_jis2euc },
      { "us-ascii",     usascii,        convert_table   },
      { "utf-8",        latin1_to_mac,  convert_utf8    }
   };
   static const struct
   {
      const char *name;
      const struct convert_table *c_table;
      size_t c_table_size;
   }
   e_table[] = {
      { "next", to_next_table, sizeof(to_next_table)/sizeof(to_next_table[0]) },
      { "mac",  to_mac_table,  sizeof(to_mac_table)/sizeof(to_mac_table[0])   }
   };
   static const struct
   {
      char tag;
      decode_function decode;
   }
   d_table[] = {
      { 'Q', decode_quoted_printable    },
      { 'B', decode_base64              },
   };
   const struct convert_table *c_table = NULL;
   size_t c_table_size = 0;
   convert_function def_convert = NULL;
   const unsigned char *def_char_table = NULL;
   static struct regex *isore = 0;   /* compiled once, on the first call */
   int errors = 0;
   const char *name = "";
   size_t len;

   if (!isore) isore = re_compile("=?\\([^?]*\\)?\\([A-Za-z]\\)?",0);

   /* Determine conversion table from output encoding. */
   if (output_encoding == NULL) output_encoding = DEF_OUTPUT_ENCODING;
   do
   {
      size_t i;

      for (i = 0;  i < sizeof(e_table)/sizeof(e_table[0]);  i++)
      {
         if (strcasecmp(e_table[i].name, output_encoding) == 0)
         {
            c_table = e_table[i].c_table;
            c_table_size = e_table[i].c_table_size;
            output_encoding = e_table[i].name;
            break;
         }
      }
      if (c_table == NULL)
      {
         /* Retry with the default, which must itself be in e_table. */
         logprintf("warning: unsupported output encoding `%s', using `%s' instead.", output_encoding, DEF_OUTPUT_ENCODING);
         output_encoding = DEF_OUTPUT_ENCODING;
      }
   }
   while (c_table == NULL);

   /* Determine default conversion function/table from input_encoding */
   if (input_encoding == NULL) input_encoding = DEF_INPUT_ENCODING;
   do
   {
      size_t i;

      for (i = 0;  i < c_table_size;  i++)
      {
         if (strcasecmp(c_table[i].name, input_encoding) == 0)
         {
            def_convert = c_table[i].convert;
            def_char_table = c_table[i].contents;
            input_encoding = c_table[i].name;
            break;
         }
      }
      if (def_convert == NULL)
      {
         /* Retry with the default, which must itself be in c_table. */
         logprintf("warning: unsupported default input encoding `%s', using `%s' instead.", input_encoding, DEF_INPUT_ENCODING);
         input_encoding = DEF_INPUT_ENCODING;
      }
   }
   while (def_convert == NULL);


   /* Convert.  `line' always points at the first octet not yet finalised;
      everything before it is already in the output encoding. */
   while (re_match(line, isore) > 0)
   {
      const unsigned char *tt = NULL;
      convert_function convert = NULL;
      decode_function decode = NULL;
      size_t i;
      int namelen;
      char coding = isore->braslist[1][0];
      char *s, *t, *p;

      /* Look up the charset named in the encoded-word. */
      name = isore->braslist[0];
      namelen = (int)(isore->braelist[0] - isore->braslist[0]);
      for (i = 0;  i < c_table_size;  i++)
      {
         if (strncasecmp(c_table[i].name, name, (size_t)namelen) == 0 &&
             c_table[i].name[namelen] == 0)
         {
            convert = c_table[i].convert;
            name = c_table[i].name;
            tt = c_table[i].contents;
            break;
         }
      }
      /* Look up the transfer-encoding letter.  <ctype.h> functions need a
         value in the unsigned char range. */
      for (i = 0;  i < sizeof(d_table)/sizeof(d_table[0]);  i++)
      {
         if (d_table[i].tag == toupper((unsigned char)coding))
         {
            decode = d_table[i].decode;
            break;
         }
      }

      /* `s' is the start of the encoded text, `len' its length. */
      s = (char *)isore->end;

      if ((t = strstr(s, "?=")) == NULL)
      {
         logprintf("warning: Missing terminating `?=' in MIME 8-bit header `%s'", line);
         len = strlen(s);
      }
      else
      {
         len = t - s;
      }

      /* Ignore whitespace between `encoded-words' [RFC2047 6.2] */
      for (p = line;  p < isore->start && isspace((unsigned char)*p);  p++) ;

      if (p < isore->start)
      {
         /* Assume default encoding outside match. */
         line += (*def_convert)(line, line, (isore->start - line), &errors, def_char_table);
      }

      /* From here on `line' may lie before `isore->start' (whitespace was
         skipped or the preceding text shrank), so text is moved down over
         the gap.  Source and destination overlap: memmove, not strcpy. */
      if (tt == NULL)
      {
         logprintf("warning: MIME 8-bit header encoding `%.*s' is unsupported", namelen, name);
         if (line < isore->start) memmove(line, isore->start, strlen(isore->start) + 1);
         len += s - isore->start;   /* step over the undecoded word */
      }
      else if (decode == NULL)
      {
         logprintf("warning: MIME header transfer-encoding `%c' is unsupported", coding);
         if (line < isore->start) memmove(line, isore->start, strlen(isore->start) + 1);
         len += s - isore->start;   /* step over the undecoded word */
      }
      else
      {
         len = (*decode)(line, s, len, &errors);
         len = (*convert)(line, line, len, &errors, tt);
         /* Append whatever follows the closing "?=". */
         if (t != NULL) memmove(line + len, t + 2, strlen(t + 2) + 1);
         else line[len] = '\0';
      }
      line = line + len;
   }

   /* Assume default encoding outside match. */
   len = (*def_convert)(line, line, strlen(line), &errors, def_char_table);
   line[len] = '\0';

   if (errors > 0)
   {
      /* NOTE(review): after an unsupported charset `name' still points into
         the (since modified) line and is not terminated at the end of the
         charset name, so this message may print trailing text. */
      logprintf("warning: MIME 8-bit header encoding `%s' incomplete conversion", name);
      return -1;
   }
   return 0;
}
