/*  File   : output.c
    Author : Richard A. O'Keefe
    Updated: 07/20/01
    Purpose: Output functions for qh.
*/
#include <unistd.h>
#include <string.h>
#include "qh.h"

/* Capacity of the output buffer, counted in wchar elements. */
#define OUTBUF_SIZE (64*1024)

/* Buffered output; oflush() writes it to standard output. */
static wchar outbuf[OUTBUF_SIZE];
/* Next free slot in outbuf; the write_* functions advance it. */
static wchar *op = outbuf;

/*  oflush(e)
    writes the buffered elements outbuf[0] .. e[-1] to standard output,
    one byte per element, retrying after short writes, and returns the
    start of the (now reusable) buffer.  In WIDE_CHARS builds each
    element is first narrowed in place to a single byte.
    A failed write() is reported through quit("output error").
*/
static wchar *
oflush(wchar *e) {
    unsigned char *bytes = (unsigned char *)outbuf;
    ssize_t remaining = e - outbuf;
    ssize_t written;

#ifdef WIDE_CHARS
    {
        /* The byte cursor never overtakes the element cursor, so the
           narrowing can be done in the same storage. */
        unsigned char *dst = bytes;
        wchar *src;

        for (src = outbuf; src != e; src++) *dst++ = *src;
    }
#endif
    for (;;) {
        written = write(STDOUT_FILENO, bytes, remaining);
        if (written >= remaining) break;
        if (written < 0) quit("output error");
        bytes += written;
        remaining -= written;
    }
    return outbuf;
}


/*  write_char('.');
    appends a single ASCII character to the output buffer, flushing
    the buffer first if it is full; does no escaping.
*/
static void
write_char(char c) {
    if (op == outbuf + OUTBUF_SIZE) op = oflush(op);
    *op++ = c;
}


/*  write_chars("...")
    appends a NUL-terminated sequence of ASCII characters to the output
    buffer, flushing whenever the buffer fills; does no escaping.
*/
static void
write_chars(char const *p) {
    wchar * const limit = outbuf + OUTBUF_SIZE;
    wchar *out = op;

    for (; *p != '\0'; p++) {
        if (out == limit) out = oflush(out);
        *out++ = *p;
    }
    op = out;
}


/*  write_text(a, [...], z)
    writes a sequence of wchars, escaping characters that don't fit.
    a and z are optional ASCII characters written before and after the
    text; a NUL (0) in either position is not written.
    Escapes used:
        \#dddddd;   characters > 255 (WIDE_CHARS builds only)
        \n          newline
        \\          backslash
        \ooo        other characters below ' ', and (for ASCII_OUTPUT
                    builds) characters >= 127
*/
static void
write_text(wchar a, wchar const *p, wchar z) {
    wchar c, b;
    wchar *d = op, *e = &outbuf[OUTBUF_SIZE];

    if (a != 0) {
        if (d == e) d = oflush(d);
        *d++ = a;
    }
    while ((c = *p++) != '\0') {
        if (d == e) d = oflush(d);
#ifdef WIDE_CHARS
        if (c > 255) {
            wchar buff[14]; /* \# <10 digits> ; NUL */
            /* Index by element count: sizeof buff is a byte count and
               would point past the array when wchar is wider than a byte. */
            wchar *t = &buff[sizeof buff / sizeof buff[0] - 1];

            /* Build the escape backwards.  Pre-decrement leaves t on the
               leading backslash, not on the unused slot before it. */
            *t = '\0';
            *--t = ';';
            do *--t = c%10 + '0'; while ((c /= 10) > 0);
            *--t = '#';
            *--t = '\\';
            while ((c = *t++) != '\0') {
                if (d == e) d = oflush(d);
                *d++ = c;
            }
        } else
#endif
        if (
#ifdef ASCII_OUTPUT
            c < ' ' || c >= 127 || c == '\\'
#else
            c < ' '      ||        c == '\\'
#endif
        ) {
            /* Room for the backslash was ensured at the top of the loop. */
            *d++ = '\\';
            if (d == e) d = oflush(d);
            if (c == '\n') c = 'n';
            if (c >= ' ' && c < 127) {
                /* 'n' (standing for newline) or the backslash itself. */
                *d++ = c;
            } else {
                /* Anything else is written as three octal digits. */
                b = ((c >> 6) & 7) + '0';
                *d++ = b;
                if (d == e) d = oflush(d);
                b = ((c >> 3) & 7) + '0';
                *d++ = b;
                if (d == e) d = oflush(d);
                b = ((c >> 0) & 7) + '0';
                *d++ = b;
            }
        } else {
            *d++ = c;
        }
    }
    if (z != 0) {
        if (d == e) d = oflush(d);
        *d++ = z;
    }
    op = d;
}


/*  write_string(q, p, z)
    writes a sequence of wchars as XML data.
    q is the opening and closing quotation mark, if any (0 means none).
    z is an ASCII character to write afterwards; 0 means none.
    The characters <, &, and q are written as decimal character
    references (&#ddd;), as are control characters other than newline
    and characters above the output range (127 for ASCII_OUTPUT,
    otherwise 255 in WIDE_CHARS builds).
    When STRICT_CXML is set, '>' and '"' that were not already written
    as character references become &gt; and &quot;.
*/
static void
write_string(wchar q, wchar const *p, wchar z) {
    wchar c;
    wchar *d = op, *e = &outbuf[OUTBUF_SIZE];
    wchar buff[14]; /* &# <10 digits> ; NUL */
    wchar *t;

    if (q != 0) {
        if (d == e) d = oflush(d);
        *d++ = q;
    }
    while ((c = *p++) != '\0') {
        if (
#ifdef ASCII_OUTPUT
            c > 127 ||
#else
#ifdef WIDE_CHARS
            c > 255 ||
#endif
#endif
            (c < ' ' && c != '\n') ||
            c == '<' || c == '&' || c == q
        ) {
            /* Index by element count: sizeof buff is a byte count and
               would point past the array when wchar is wider than a byte. */
            t = &buff[sizeof buff / sizeof buff[0] - 1];
            /* Build the reference backwards.  Pre-decrement leaves t on
               the leading '&', not on the unused slot before it. */
            *t = '\0';
            *--t = ';';
            do *--t = c%10 + '0'; while ((c /= 10) > 0);
            *--t = '#';
            *--t = '&';
            while ((c = *t++) != '\0') {
                if (d == e) d = oflush(d);
                *d++ = c;
            }
        } else {
#if STRICT_CXML
            static wchar amp [] = {'&','a','m','p',';','\0'};
            static wchar lt  [] = {'&','l','t',';','\0'};
            static wchar gt  [] = {'&','g','t',';','\0'};
            static wchar quot[] = {'&','q','u','o','t',';','\0'};

            /* '&' and '<' never reach this switch: the test above has
               already turned them into numeric references. */
            switch (c) {
                case '&': t = amp ; break;
                case '<': t = lt  ; break;
                case '>': t = gt  ; break;
                case '"': t = quot; break;
                default :
                    if (d == e) d = oflush(d);
                    *d++ = c;
                    continue;
            }
            while ((c = *t++) != '\0') {
                if (d == e) d = oflush(d);
                *d++ = c;
            }
#else
            if (d == e) d = oflush(d);
            *d++ = c;
#endif
        }
    }
    if (q != 0) {
        if (d == e) d = oflush(d);
        *d++ = q;
    }
    if (z != 0) {
        if (d == e) d = oflush(d);
        *d++ = z;
    }
    op = d;
}


/*  write_tmn(a, [...], t)
    writes a sequence of wchars, with characters that don't fit
    escaped as \c if ASCII punctuation or as \dddddd; otherwise.
    Newline and tab characters are not escaped.
    The ASCII character a will be added at the beginning
    if it is not NUL.
    The `t' argument says whether this is text (t == 1) or not
    (t == 0); this affects whether the ';' and '|' characters
    need escaping (t == 0) or not (t == 1).
    '{', '}' and '\' are escaped in both cases.
*/
static void
write_tmn(char a, wchar const *p, int t) {
    wchar c;
    wchar *d = op, *e = &outbuf[OUTBUF_SIZE];
    /* special[c] > t means c needs a backslash: 1 = only outside text,
       2 = always.  Filled in on the first call. */
    static char special[256];

    if (special['{'] == 0) {
        special[';'] = special['|'] = 1;
        special['{'] = special['}'] = special['\\'] = 2;
    }

    if (a != 0) {
        if (d == e) d = oflush(d);
        *d++ = a;
    }
    while ((c = *p++) != '\0') {
#ifdef ASCII_OUTPUT
#define CMAX 127
#else
#define CMAX 255
#endif
        if (c > CMAX || (c < ' ' && c != '\t' && c != '\n')) {
            wchar buff[13]; /* \ <10 digits> ; NUL */
            /* One past the last element, by element count: sizeof buff
               alone is a byte count and overshoots for wide wchar. */
            wchar *b = &buff[sizeof buff / sizeof buff[0]];
            wchar x;

            /* Build the escape backwards; every store pre-decrements so
               the digits sit between the ';' and the backslash with no
               gap, and b ends on the backslash. */
            *--b = '\0';
            *--b = ';';
            do *--b = c%10 + '0'; while ((c /= 10) > 0);
            *--b = '\\';
            while ((x = *b++) != '\0') {
                if (d == e) d = oflush(d);
                *d++ = x;
            }
        } else {
            if (special[c] > t) {
                if (d == e) d = oflush(d);
                *d++ = '\\';
            }
            if (d == e) d = oflush(d);
            *d++ = c;
        }
    }
    op = d;
}


/*  Write out everything buffered so far and reset op to the start of
    the buffer. */
static void finish_output(void) {
    op = oflush(op);
}

/*  Like finish_output(), but first appends a newline when the buffer is
    non-empty and its last character is not already a newline. */
static void finish_with_newline(void) {
    if (op != outbuf && op[-1] != '\n') write_char('\n');
    finish_output();
}

/*  White space handling.
    The proper thing to do is to have a validating parser which
    doesn't report element content white space in the first place.
    XSLT uses a hack where strings that are all white space are
    dropped.  We do that here too, IF requested.  But it really
    belongs in the parser.
*/
static int drop_white_space = 0;
/*  Returns 1 when every character of s up to the terminating NUL is
    <= ' ' (so the empty string counts as white space), otherwise 0. */
static int is_white_space(wchar const *s) {
    for (; *s != '\0'; s++) {
        if (*s > ' ') return 0;
    }
    return 1;
}


/*  --------------------------------------------------------------------
    NULL handlers.  These don't write any output.
    Use them when you just want a syntax check.
*/

/*  Element handler that ignores both arguments and writes nothing. */
static void /*ARGSUSED*/
null_begin_end(wchar const *tag, struct ROKXML_binding *atts) {}

/*  Two-string handler that ignores both arguments and writes nothing.
    get_handlers() also installs it when PI output was not requested. */
static void /*ARGSUSED*/
null_pi(wchar const *target, wchar const *data) {}

/*  One-string handler that ignores its argument and writes nothing. */
static void /*ARGSUSED*/
null_text(wchar const *text) {}

/*  Handler table for -hn: every entry discards its input; only the last
    entry, finish_output, does any work (it flushes the buffer). */
static struct ROKXML_handlers
null_handlers = {
    null_begin_end,
    null_begin_end,
    null_begin_end,
    null_text,
    null_text,
    null_text,
    null_text,
    null_pi,
    null_pi,
    finish_output
};


/*  --------------------------------------------------------------------
    SUM handlers.  -hs will write the gi of each start- or empty- tag.
    -hsw will also write the attributes, one per line, each preceded
    by a tab, AFTER the gi they apply to.  Use this + sort | uniq -c
    or + an AWK script for summarising the use of tags.
*/

/*  Writes the tag on a line of its own; when drop_white_space is set
    (it doubles as the "list attributes too" flag) each attribute name
    follows, one per line. */
static void
sum_begin(wchar const *tag, struct ROKXML_binding *atts) {
    write_text('\0', tag, '\n');
    /* I know, I know, I should have another flag variable. */
    if (!drop_white_space) return;
    while (atts != 0) {
        write_text('\0', atts->name, '\n');
        atts = atts->next;
    }
}

/*  Handler table for -hs: sum_begin is used for two of the three
    element entries; everything else is ignored until finish_output
    flushes the buffer. */
static struct ROKXML_handlers
sum_handlers = {
    sum_begin,
    null_begin_end,
    sum_begin,
    null_text,
    null_text,
    null_text,
    null_text,
    null_pi,
    null_pi,
    finish_output
};


/*  --------------------------------------------------------------------
    TEXT handlers.  These write out the contents of #PCDATA, CDATA sections,
    and unexpanded entity references (as &foo;).  This is not quite good
    enough for a spelling checker.  See my spell.awk.  There should be a
    separate spell.c using this interface.
*/

/*  Writes the text through write_text() with nothing added before or
    after it; all-white-space strings are skipped when drop_white_space
    is set. */
static void
text_text(wchar const *text) {
    if (drop_white_space && is_white_space(text)) return;
    write_text('\0', text, '\0');
}

/*  Writes an entity reference back out as &name; */
static void
text_entity(wchar const *entity) {
    write_text('&', entity, ';');
}

/*  Handler table for -hd: only text_text and text_entity produce
    output; finish_with_newline ends the output with a newline and
    flushes. */
static struct ROKXML_handlers
text_handlers = {
    null_begin_end,
    null_begin_end,
    null_begin_end,
    text_text,
    text_text,
    text_entity,
    null_text,
    null_pi,
    null_pi,
    finish_with_newline
};


/*  --------------------------------------------------------------------
    ESIS handlers.  These write out a subset of the ESIS that you would
    get from SGMLS or NSGMLS.  They can't write more than they do, as
    that would require processing the document type.
*/

/*  Writes one "A<name> CDATA <value>" line per attribute, followed by
    a "(<tag>" line. */
static void
esis_begin(wchar const *tag, struct ROKXML_binding *atts) {
    struct ROKXML_binding *att = atts;

    while (att != 0) {
        write_text('A', att->name, '\0');
        write_chars(" CDATA");
        write_text(' ', att->value, '\n');
        att = att->next;
    }
    write_text('(', tag, '\n');
}

/*  Writes a ")<tag>" line; atts is not used. */
static void /*ARGSUSED*/
esis_end(wchar const *tag, struct ROKXML_binding *atts) {
    write_text(')', tag, '\n');
}

/*  Writes the same output as esis_begin() immediately followed by
    esis_end(). */
static void
esis_empty(wchar const *tag, struct ROKXML_binding *atts) {
    esis_begin(tag, atts);
    esis_end(tag, atts);
}

/*  Writes a "-<text>" line; all-white-space strings are skipped when
    drop_white_space is set. */
static void
esis_text(wchar const *text) {
    if (drop_white_space && is_white_space(text)) return;
    write_text('-', text, '\n');
}

/*  Writes a "=<cdata>" line. */
static void
esis_cdata(wchar const *cdata) {
    write_text('=', cdata, '\n');
}

/*  Writes a "&<entity>" line. */
static void
esis_entity(wchar const *entity) {
    write_text('&', entity, '\n');
}

/*  Writes a "?<target>" line, or "?<target> <data>" when data is
    not null. */
static void
esis_pi(wchar const *target, wchar const *data) {
    /* The target ends the line only when no data follows it. */
    write_text('?', target, data == 0 ? '\n' : 0);
    if (data != 0) write_text(' ', data, '\n');
}

/*  Writes a "!<comment>" line. */
static void
esis_comment(wchar const *comment) {
    write_text('!', comment, '\n');
}

/*  Handler table for -he.  get_handlers() clears the comment and cdata
    entries and replaces the pi entry with null_pi unless the matching
    modifier was given. */
static struct ROKXML_handlers
esis_handlers = {
    esis_begin,
    esis_end,
    esis_empty,
    esis_text,
    esis_cdata,
    esis_entity,
    esis_comment,
    esis_pi,
    null_pi,
    finish_output
};


/*  --------------------------------------------------------------------
    CXML (Canonical XML) handlers.  These are for writing a document out
    as XML.  This output file can be used with other parsers, notably my
    WBXML decoder.  Some day soon I should write an ESIS reader.

    Real canonical XML uses extremely long lines.
    If you want to use James Clark's test suite, you really need real cxml.
    If you want to look at the output, you want line-broken cxml.
    Define STRICT_CXML to get the strict version.  Perhaps this should be
    a run-time option.
*/

/*  Writes "<tag", the attributes as name="value", and then the string
    `end' (">" or "/>").  With STRICT_CXML everything stays on one line
    with a space before each attribute; otherwise the tag name and each
    attribute are followed by a newline. */
static void
cxml_tag(wchar const *tag, struct ROKXML_binding *atts, char const *end) {
    struct ROKXML_binding *att;
#if STRICT_CXML
    wchar const after_tag = '\0', before_name = ' ',  after_value = '\0';
#else
    wchar const after_tag = '\n', before_name = '\0', after_value = '\n';
#endif

    write_text('<', tag, after_tag);
    for (att = atts; att != 0; att = att->next) {
        write_text(before_name, att->name, '=');
        write_string('"', att->value, after_value);
    }
    write_chars(end);
}


/*  Writes a start tag: cxml_tag() closed with ">". */
static void
cxml_begin(wchar const *tag, struct ROKXML_binding *atts) {
    cxml_tag(tag, atts, ">");
}

/*  Writes an end tag; atts is not used.
    With STRICT_CXML the tag is written as </tag> with no line break,
    matching what cxml_tag() does for start tags; otherwise a newline
    is placed between the tag name and the closing '>'. */
static void /*ARGSUSED*/
cxml_end(wchar const *tag, struct ROKXML_binding *atts) {
    write_char('<');
#if STRICT_CXML
    write_text('/', tag, '\0');
#else
    write_text('/', tag, '\n');
#endif
    write_char('>');
}

/*  Writes an empty-element tag: cxml_tag() closed with "/>". */
static void
cxml_empty(wchar const *tag, struct ROKXML_binding *atts) {
    cxml_tag(tag, atts, "/>");
}

/*  Writes character data through write_string() with no quotation
    marks; all-white-space strings are skipped when drop_white_space
    is set. */
static void
cxml_text(wchar const *text) {
    if (drop_white_space && is_white_space(text)) return;
    write_string('\0', text, '\0');
}

/*  Writes the text between "<![CDATA[" and "]]>".  The content goes
    through write_text(), so it gets that function's backslash
    escapes. */
static void
cxml_cdata(wchar const *cdata) {
    write_chars("<![CDATA[");
    write_text('\0', cdata, '\0');
    write_chars("]]>");
}

/*  Writes an entity reference back out as &name; */
static void
cxml_entity(wchar const *entity) {
    write_text('&', entity, ';');
}
    
/*  Writes "<?target?>", or "<?target data?>" when data is not null. */
static void
cxml_pi(wchar const *target, wchar const *data) {
    write_chars("<?");
    write_text('\0', target, '\0');
    if (data != 0) write_text(' ', data, '\0');
    write_chars("?>");
}

/*  Writes a comment as "<!--" text "--" newline ">".  The parameter
    holds the comment text, despite being named `pi'. */
static void
cxml_comment(wchar const *pi) {
    write_chars("<!--");
    write_text('\0', pi, '\0');
    write_chars("--\n>");
}

/*  Handler table for -hc.  get_handlers() clears the comment and cdata
    entries and replaces the pi entry with null_pi unless the matching
    modifier was given. */
static struct ROKXML_handlers
cxml_handlers = {
    cxml_begin,
    cxml_end,
    cxml_empty,
    cxml_text,
    cxml_cdata,
    cxml_entity,
    cxml_comment,
    cxml_pi,
    null_pi,
    finish_with_newline
};


/*  --------------------------------------------------------------------
    TMN handlers.  Tiny Markup Notation is a notation I devised to see
    whether it would have been possible to have something with the same
    power as XML but substantially simpler.  It is, and the result is
    more compact, but it isn't as _much_ more compact as I'd hoped.
*/

/*  Writes "{tag", then ";name" for each attribute, followed by
    " value" unless the value is identical to the name, and finally the
    character c ('|' to open content, '}' for an empty element). */
static void
tmn_mtbg(wchar const *tag, struct ROKXML_binding *atts, char c) {
    struct ROKXML_binding *b;

    write_tmn('{', tag, 0);
    for (b = atts; b != 0; b = b->next) {
        wchar const *n = b->name;
        wchar const *v = b->value;

        write_tmn(';', b->name, 0);
        /* Compare element by element: strcmp() on wchar data is wrong
           whenever wchar is wider than char. */
        while (*n != '\0' && *n == *v) {
            n++;
            v++;
        }
        if (*n != *v) write_tmn(' ', b->value, 0);
    }
    write_char(c);
}

/*  Start tag: tag and attributes followed by '|'. */
static void
tmn_begin(wchar const *tag, struct ROKXML_binding *atts) {
    tmn_mtbg(tag, atts, '|');
}

/*  End tag: just '}'; neither argument is used. */
static void /*ARGSUSED*/
tmn_end(wchar const *tag, struct ROKXML_binding *atts) {
    write_char('}');
}

/*  Empty element: tag and attributes followed directly by '}'. */
static void
tmn_empty(wchar const *tag, struct ROKXML_binding *atts) {
    tmn_mtbg(tag, atts, '}');
}

/*  Writes text in text mode (t == 1); all-white-space strings are
    skipped when drop_white_space is set. */
static void
tmn_text(wchar const *text) {
    if (drop_white_space && is_white_space(text)) return;
    write_tmn('\0', text, 1);
}

/*  Writes CDATA content in text mode (t == 1); unlike tmn_text() it
    never drops white space. */
static void
tmn_cdata(wchar const *text) {
    write_tmn('\0', text, 1);
}

/*  Writes an entity reference as a backslash, the entity name, and
    ';'. */
static void
tmn_entity(wchar const *entity) {
    write_tmn('\\', entity, 1);
    write_char(';');
}

/*  Writes a processing instruction as "{p:|target}" or
    "{p:|target data}" when data is not null. */
static void
tmn_pi(wchar const *target, wchar const *data) {
    write_chars("{p:");
    write_tmn('|', target, 1);
    if (data != 0) write_tmn(' ', data, 1);
    write_char('}');
}

/*  Writes a comment as "{c:|comment}". */
static void
tmn_comment(wchar const *comment) {
    write_chars("{c:");
    write_tmn('|', comment, 1);
    write_char('}');
}

/*  Handler table for -ht.  get_handlers() clears the comment and cdata
    entries and replaces the pi entry with null_pi unless the matching
    modifier was given. */
static struct ROKXML_handlers
tmn_handlers = {
    tmn_begin,
    tmn_end,
    tmn_empty,
    tmn_text,
    tmn_cdata,
    tmn_entity,
    tmn_comment,
    tmn_pi,
    null_pi,
    finish_with_newline
};


/*  --------------------------------------------------------------------
    LISP handlers.  These are used to convert XML to a Lisp format that
    Common Lisp and Scheme can both read using (read).  Perhaps Prolog
    should be handled too.

   <lisp document> ::=
      (DOCUMENT [(ROOT . "<name>")] [(PUBLIC . <string>)] [(SYSTEM . <string>)]
        {<element> | <pi>}*)

    <element> ::=
      (<name> {(<name> . <string>)}*
        {<element> | <pi> | <string> | <reference> | <cdata> | <comment>}*)

    <pi> ::=
      #(PI <name> [<string>])
    <cdata> ::=
      #(CDATA <string>)
    <comment> ::=
      #(COMMENT <string>)

    <reference> ::= <symbol>
    <string> ::= <string> using "..", \\, \"

*/
/* Current nesting depth of the Lisp output; write_lsp() indents new
   lines by this many spaces. */
static int level = 0;

/*  write_lsp() writes out a chunk of Lisp text.
    If newline is true, it terminates the current line and tabs in,
    the indentation being given by level.

    If a prefix is provided, the characters are written verbatim.

    If a name is provided, it is written as a Lisp symbol.
    This *should* mean with appropriate quoting if it is not a well
    formed Lisp symbol, but since there is no quoting that scm
    understands, and since plausible XML names won't need it, that
    isn't done yet.

    If a separator is provided, and there is a string for it to precede,
    the separator is written verbatim.  This doesn't depend on whether
    there is a name or not.  Once it did, then I thought it mustn't, and
    now it doesn't really matter whether it does or not.

    If a string is provided, it is written out surrounded by double
    quotes.  Double quotes and backslashes in the value will be
    protected by backslashes, <<a"b\c"d>> -> "a\"b\\c\"d".

    Finally, if a suffix is provided, it is written verbatim.

    All of this has been squished into one procedure so that we can
    write as fast as possible.  The calls to 'deposit' could be calls
    to 'write_char' with the same meaning, but this way we just drop
    the characters straight into memory, only calling another procedure
    when the buffer is full.
*/
static void write_lsp(
    int          newline,
    char const  *prefix,
    wchar const *name,
    char const  *separator,
    wchar const *string,
    char const  *suffix
) {
    wchar * const limit = outbuf + OUTBUF_SIZE;
    wchar *out = op;
    /* Store one character, flushing first if the buffer is full. */
#   define deposit(ch) \
        do { if (out == limit) out = oflush(out); *out++ = (ch); } while (0)

    if (newline) {
        int indent = level;

        deposit('\n');
        while (indent-- > 0) deposit(' ');
    }
    if (prefix != 0) {
        for (; *prefix != '\0'; prefix++) deposit(*prefix);
    }
    if (name != 0) {
        for (; *name != '\0'; name++) deposit(*name);
    }
    if (string != 0) {
        /* The separator is written only when a string follows it. */
        if (separator != 0) {
            for (; *separator != '\0'; separator++) deposit(*separator);
        }
        deposit('"');
        for (; *string != '\0'; string++) {
            if (*string == '"' || *string == '\\') deposit('\\');
            deposit(*string);
        }
        deposit('"');
    }
    if (suffix != 0) {
        for (; *suffix != '\0'; suffix++) deposit(*suffix);
    }
    op = out;
#   undef deposit
}


/*  Opens the document form: "(DOCUMENT", then (ROOT . "root") and
    (PUBLIC . "pubid") for whichever of the two is not null, and
    increases the nesting level.
    The keywords are passed in the char prefix, not cast to wchar* and
    passed as the name: a char literal read as wchar is garbage (and is
    read past its end) when wchar is wider than char. */
static void lsp_start(wchar const *root, wchar const *pubid) {
    write_lsp(0, "(DOCUMENT", 0, 0, 0, 0);
    if (root  != 0) write_lsp(0, " (ROOT",   0, " . ", root , ")");
    if (pubid != 0) write_lsp(0, " (PUBLIC", 0, " . ", pubid, ")");
    level++;
}

/*  Closes every form that is still open (one ')' per nesting level,
    leaving level at 0), then ends the output with a newline and
    flushes it. */
static void lsp_finish(void) {
    for (; level > 0; level--) write_char(')');
    finish_with_newline();
}

/*  Starts a new indented line with "(tag", writes each attribute as
    (name . "value"), and increases the nesting level. */
static void lsp_begin(wchar const *tag, struct ROKXML_binding *atts) {
    write_lsp(1, "(", tag, 0, 0, 0);
    while (atts != 0) {
        write_lsp(0, " (", atts->name, " . ", atts->value, ")");
        atts = atts->next;
    }
    level++;
}

/*  Closes the current element form with ')' and decreases the nesting
    level; neither argument is used. */
static void lsp_end(wchar const *tag, struct ROKXML_binding *atts) {
    write_char(')');
    level--;
}

/*  Empty element: the output of lsp_begin() immediately followed by
    that of lsp_end(). */
static void lsp_empty(wchar const *tag, struct ROKXML_binding *atts) {
    lsp_begin(tag, atts);
    lsp_end(tag, atts);
}

/*  Writes the text as a quoted string on a new indented line;
    all-white-space strings are skipped when drop_white_space is set. */
static void lsp_text(wchar const *text) {
    if (drop_white_space && is_white_space(text)) return;
    write_lsp(1, 0, 0, 0, text, 0);
}

/*  Writes #(CDATA "...") on a new indented line.
    The keyword is part of the char prefix, not a char literal cast to
    wchar*, which would be misread when wchar is wider than char. */
static void lsp_cdata(wchar const *cdata) {
    write_lsp(1, "#(CDATA", 0, " ", cdata, ")");
}

/*  Writes the entity name as a bare symbol on a new indented line. */
static void lsp_entity(wchar const *entity) {
    write_lsp(1, 0, entity, 0, 0, 0);
}

/*  Writes #(COMMENT "...") on a new indented line.
    The keyword is part of the char prefix, not a char literal cast to
    wchar*, which would be misread when wchar is wider than char. */
static void lsp_comment(wchar const *s) {
    write_lsp(1, "#(COMMENT", 0, " ", s, ")");
}
    
/*  Writes #(PI target) or #(PI target "data") on a new indented line;
    the data string is left out when data is null.
    The keyword is part of the char prefix, not a char literal cast to
    wchar*, which would be misread when wchar is wider than char. */
static void lsp_pi(wchar const *target, wchar const *data) {
    write_lsp(1, "#(PI", 0, 0, 0, 0);
    write_lsp(0, " ", target, " ", data, ")");
}

/*  Handler table for -hl.  Unlike the other tables it has external
    linkage, and its last two entries are lsp_start and lsp_finish.
    get_handlers() clears the comment and cdata entries and replaces the
    pi entry with null_pi unless the matching modifier was given. */
struct ROKXML_handlers lsp_handlers = {
    lsp_begin,
    lsp_end,
    lsp_empty,
    lsp_text,
    lsp_cdata,
    lsp_entity,
    lsp_comment,
    lsp_pi,
    lsp_start,
    lsp_finish
};


/*  HANDLER SELECTION FROM THE COMMAND LINE.
   The options for handlers are

    -hn			null handlers
    -hs			summary handlers (count elements, optionally atts)
    -hd			text handlers (data)
    -he			esis handlers
    -hc			cxml handlers
    -ht			TMN (Tiny Markup Notation)
    -hl			Lisp

    followed by some or all of the following:
	=		do want CDATA (=) records
	?		do want PI (?) records
	!		do want COMMENT (!) records
	*		want them all
	w		Suppress white space strings?
*/

/*  get_handlers(opt)
    opt is the text following -h.  Its first character selects a
    handler table (either letter case); every later character is a
    modifier, where w/W/space sets drop_white_space, c/C/! keeps
    comments, d/D/= keeps CDATA, p/P/? keeps PIs and a/A/* keeps all
    three.  Anything else is reported through quit().
    The selected table is modified in place: entries that were not
    asked for are cleared (comment, cdata) or replaced by null_pi (pi).
*/
struct ROKXML_handlers const *get_handlers(char const *opt) {
    struct ROKXML_handlers *r;
    char const *o;
    int keep_comment = 0;
    int keep_cdata   = 0;
    int keep_pi      = 0;

    switch (opt[0]) {
        case 'n': case 'N': r = &null_handlers; break;
        case 'c': case 'C': r = &cxml_handlers; break;
        case 'd': case 'D': r = &text_handlers; break;
        case 'e': case 'E': r = &esis_handlers; break;
        case 't': case 'T': r = &tmn_handlers;  break;
        case 'l': case 'L': r = &lsp_handlers;  break;
        case 's': case 'S': r = &sum_handlers;  break;
        default: quit("unknown value of -h option"); break;
    }
    for (o = opt + 1; *o != '\0'; o++) {
        switch (*o) {
            case 'w': case 'W': case ' ':
                drop_white_space = 1;
                break;
            case 'c': case 'C': case '!':
                keep_comment = 1;
                break;
            case 'd': case 'D': case '=':
                keep_cdata = 1;
                break;
            case 'p': case 'P': case '?':
                keep_pi = 1;
                break;
            case 'a': case 'A': case '*':
                keep_comment = keep_cdata = keep_pi = 1;
                break;
            default:
                quit("unknown modifier of -h option");
                break;
        }
    }
    if (!keep_comment) r->comment = 0;
    if (!keep_cdata  ) r->cdata   = 0;
    if (!keep_pi     ) r->pi      = null_pi;
    return r;
}