/* File : output.c Author : Richard A. O'Keefe Updated: 07/20/01 Purpose: Output functions for qh. */ #include #include #include "qh.h" #define OUTBUF_SIZE (64*1024) static wchar outbuf[OUTBUF_SIZE]; static wchar *op = outbuf; static wchar * oflush(wchar *e) { ssize_t m, n = e-outbuf; unsigned char *s; #ifdef WIDE_CHARS wchar *p = outbuf; s = (unsigned char *)outbuf; while (p != e) *s++ = *p++; #endif s = (unsigned char *)outbuf; m = write(STDOUT_FILENO, s, n); while (m < n) { if (m < 0) quit("output error"); s += m; n -= m; m = write(STDOUT_FILENO, s, n); } return outbuf; } /* write_char('.'); writes a single ASCII character; does no escaping. */ static void write_char(char c) { wchar *d = op, *e = &outbuf[OUTBUF_SIZE]; if (d == e) d = oflush(d); *d++ = c; op = d; } /* write_chars("...") writes a sequence of ASCII characters; does no escaping. */ static void write_chars(char const *p) { char c; wchar *d = op, *e = &outbuf[OUTBUF_SIZE]; while ((c = *p++) != '\0') { if (d == e) d = oflush(d); *d++ = c; } op = d; } /* write_text(a, [...], z) writes a sequence of wchars, with characters that don't fit escaped as \ooo or \#dddddd; as appropriate. There may be ASCII characters a,z added at either end; a NUL character will not be written. \#ddddd; is used for characters >= 256; \ooo output is used for characters below ' ' or (for ASCII output) >= 127. 
*/ static void write_text(wchar a, wchar const *p, wchar z) { wchar c, b; wchar *d = op, *e = &outbuf[OUTBUF_SIZE]; if (a != 0) { if (d == e) d = oflush(d); *d++ = a; } while ((c = *p++) != '\0') { if (d == e) d = oflush(d); #ifdef WIDE_CHARS if (c > 255) { wchar buff[14]; /* \# <10 digits> ; */ wchar *t = &buff[sizeof buff - 1]; *t-- = '\0'; *t-- = ';'; do *t-- = c%10 + '0'; while ((c /= 10) > 0); *t-- = '#'; *t-- = '\\'; while ((c = *t++) != '\0') { if (d == e) d = oflush(d); *d++ = c; } } else #endif if ( #ifdef ASCII_OUTPUT c < ' ' || c >= 127 || c == '\\' #else c < ' ' || c == '\\' #endif ) { *d++ = '\\'; if (d == e) d = oflush(d); if (c == '\n') c = 'n'; if (c >= ' ') { *d++ = c; } else { b = ((c >> 6) & 7) + '0'; *d++ = b; if (d == e) d = oflush(d); b = ((c >> 3) & 7) + '0'; *d++ = b; if (d == e) d = oflush(d); b = ((c >> 0) & 7) + '0'; *d++ = b; } } else { *d++ = c; } } if (z != 0) { if (d == e) d = oflush(d); *d++ = z; } op = d; } /* write_string(q, p, z) writes a sequence of wchars as XML data. q is the opening and closing quotation mark, if any (0 means none). z is an ASCII character to write afterwards; 0 means none. The characters <, &, and q are written as character references; characters outside the normal printing range are also written thus. 
*/ static void write_string(wchar q, wchar const *p, wchar z) { wchar c; wchar *d = op, *e = &outbuf[OUTBUF_SIZE]; wchar buff[14]; /* &# <10 digits> ; */ wchar *t; if (q != 0) { if (d == e) d = oflush(d); *d++ = q; } while ((c = *p++) != '\0') { if ( #ifdef ASCII_OUTPUT c > 127 || #else #ifdef WIDE_CHARS c > 255 || #endif #endif (c < ' ' && c != '\n') || c == '<' || c == '&' || c == q ) { t = &buff[sizeof buff - 1]; *t-- = '\0'; *t-- = ';'; do *t-- = c%10 + '0'; while ((c /= 10) > 0); *t-- = '#'; *t-- = '&'; while ((c = *t++) != '\0') { if (d == e) d = oflush(d); *d++ = c; } } else { #if STRICT_CXML static wchar amp [] = {'&','a','m','p',';','\0'}; static wchar lt [] = {'&','l','t',';','\0'}; static wchar gt [] = {'&','g','t',';','\0'}; static wchar quot[] = {'&','q','u','o','t',';','\0'}; switch (c) { case '&': t = amp ; break; case '<': t = lt ; break; case '>': t = gt ; break; case '"': t = quot; break; default : if (d == e) d = oflush(d); *d++ = c; continue; } while ((c = *t++) != '\0') { if (d == e) d = oflush(d); *d++ = c; } #else if (d == e) d = oflush(d); *d++ = c; #endif } } if (q != 0) { if (d == e) d = oflush(d); *d++ = q; } if (z != 0) { if (d == e) d = oflush(d); *d++ = z; } op = d; } /* write_tmn(a, [...], t) writes a sequence of wchars, with characters that don't fit escaped as \c if ASCII punctuation or as \dddddd; otherwise. Newline and tab characters are not escaped. The ASCII character a will be added at the beginning if it is not NUL. The `t' argument says whether this is text (t == 1) or not (t == 0); this affects whether the ';' and '|' characters need escaping (t == 0) or not (t == 1). 
*/ static void write_tmn(char a, wchar const *p, int t) { wchar c; wchar *d = op, *e = &outbuf[OUTBUF_SIZE]; static char special[256]; if (special['{'] == 0) { special[';'] = special['|'] = 1; special['{'] = special['}'] = special['\\'] = 2; } if (a != 0) { if (d == e) d = oflush(d); *d++ = a; } while ((c = *p++) != '\0') { #ifdef ASCII_OUTPUT #define CMAX 127 #else #define CMAX 255 #endif if (c > CMAX || (c < ' ' && c != '\t' && c != '\n')) { wchar buff[13]; /* \ <10 digits> ; */ wchar *b = &buff[sizeof buff]; wchar x; *--b = '\0'; *--b = ';'; do *b-- = c%10 + '0'; while ((c /= 10) > 0); *--b = '\\'; while ((x = *b++) != '\0') { if (d == e) d = oflush(d); *d++ = x; } } else { if (special[c] > t) { if (d == e) d = oflush(d); *d++ = '\\'; } if (d == e) d = oflush(d); *d++ = c; } } op = d; } static void finish_output(void) { op = oflush(op); } static void finish_with_newline(void) { if (op != outbuf && op[-1] != '\n') write_char('\n'); finish_output(); } /* White space handling. The proper thing to do is to have a validating parser which doesn't report element content white space in the first place. XSLT uses a hack where strings that are all white space are dropped. We do that here to, IF requested. But it really belongs in the parser. */ static int drop_white_space = 0; static int is_white_space(wchar const *s) { wchar c; while ((c = *s) <= ' ' && c != '\0') s++; return c == '\0'; } /* -------------------------------------------------------------------- NULL handlers. These don't write any output. Use them when you just want a syntax check. 
*/ static void /*ARGSUSED*/ null_begin_end(wchar const *tag, struct ROKXML_binding *atts) {} static void /*ARGSUSED*/ null_pi(wchar const *target, wchar const *data) {} static void /*ARGSUSED*/ null_text(wchar const *text) {} static struct ROKXML_handlers null_handlers = { null_begin_end, null_begin_end, null_begin_end, null_text, null_text, null_text, null_text, null_pi, null_pi, finish_output }; /* -------------------------------------------------------------------- SUM handlers. -hs will write the gi of each start- or empty- tag. -hsw will also write the attributes, one per line, each preceded by a tab, AFTER the gi they apply to. Use this + sort | uniq -c or + an AWK script for summarising the use of tags. */ static void sum_begin(wchar const *tag, struct ROKXML_binding *atts) { struct ROKXML_binding *b; write_text('\0', tag, '\n'); if (drop_white_space) { /* I know, I know, I should have another flag variable. */ for (b = atts; b != 0; b = b->next) write_text('\0', b->name, '\n'); } } static struct ROKXML_handlers sum_handlers = { sum_begin, null_begin_end, sum_begin, null_text, null_text, null_text, null_text, null_pi, null_pi, finish_output }; /* -------------------------------------------------------------------- TEXT handlers. These write out the contents of #PCDATA, CDATA sections, and unexpanded entity references (as &foo;). This is not quite good enough for a spelling checker. See my spell.awk. There should be a separate spell.c using this interface. */ static void text_text(wchar const *text) { if (drop_white_space && is_white_space(text)) return; write_text('\0', text, '\0'); } static void text_entity(wchar const *entity) { write_text('&', entity, ';'); } static struct ROKXML_handlers text_handlers = { null_begin_end, null_begin_end, null_begin_end, text_text, text_text, text_entity, null_text, null_pi, null_pi, finish_with_newline }; /* -------------------------------------------------------------------- ESIS handlers. 
These write out a subset of the ESIS that you would get from SGMLS or NSGMLS. They can't write more than they do, as that would require processing the document type. */ static void esis_begin(wchar const *tag, struct ROKXML_binding *atts) { struct ROKXML_binding *b; for (b = atts; b != 0; b = b->next) { write_text('A', b->name, '\0'); write_chars(" CDATA"); write_text(' ', b->value, '\n'); } write_text('(', tag, '\n'); } static void /*ARGSUSED*/ esis_end(wchar const *tag, struct ROKXML_binding *atts) { write_text(')', tag, '\n'); } static void esis_empty(wchar const *tag, struct ROKXML_binding *atts) { esis_begin(tag, atts); esis_end(tag, atts); } static void esis_text(wchar const *text) { if (drop_white_space && is_white_space(text)) return; write_text('-', text, '\n'); } static void esis_cdata(wchar const *cdata) { write_text('=', cdata, '\n'); } static void esis_entity(wchar const *entity) { write_text('&', entity, '\n'); } static void esis_pi(wchar const *target, wchar const *data) { if (data == 0) { write_text('?', target, '\n'); } else { write_text('?', target, 0); write_text(' ', data, '\n'); } } static void esis_comment(wchar const *comment) { write_text('!', comment, '\n'); } static struct ROKXML_handlers esis_handlers = { esis_begin, esis_end, esis_empty, esis_text, esis_cdata, esis_entity, esis_comment, esis_pi, null_pi, finish_output }; /* -------------------------------------------------------------------- CXML (Canonical XML) handlers. These are for writing a document out as XML. This output file can be used with other parsers, notably my WBXML decoder. Some day soon I should write an ESIS reader. Real canonical XML uses extremely long lines. If you want to use James Clark's test suite, you really need real cxml. If you want to look at the output, you want line-broken cxml. Define STRICT_CXML to get the strict version. Perhaps this should be a run-time option. 
*/
/*  cxml_tag(tag, atts, end)
    writes '<', the tag name, each attribute as name="value", and
    finally the ASCII string `end'.
    With STRICT_CXML everything goes on one line, with a space
    before each attribute.  Otherwise a newline follows the tag name
    and each attribute value, so `end' starts a line of its own.
    Names go through write_text() and values through write_string()
    with '"' as the quotation mark.
*/
static void cxml_tag(wchar const *tag, struct ROKXML_binding *atts, char const *end) {
    struct ROKXML_binding *b;
#if STRICT_CXML
    write_text('<', tag, '\0');
    for (b = atts; b != 0; b = b->next) {
        write_text(' ', b->name, '=');
        write_string('"', b->value, '\0');
    }
#else
    write_text('<', tag, '\n');
    for (b = atts; b != 0; b = b->next) {
        write_text('\0', b->name, '=');
        write_string('"', b->value, '\n');
    }
#endif
    write_chars(end);
}
/*  cxml_begin(tag, atts): a start tag, closed with ">".  */
static void cxml_begin(wchar const *tag, struct ROKXML_binding *atts) { cxml_tag(tag, atts, ">"); }
/*  cxml_end(tag, atts): writes '<', '/', the tag name, a newline,
    and '>'; the attributes are ignored.  The newline is written
    whether or not STRICT_CXML is defined.
*/
static void /*ARGSUSED*/ cxml_end(wchar const *tag, struct ROKXML_binding *atts) { write_char('<'); write_text('/', tag, '\n'); write_char('>'); }
/*  cxml_empty(tag, atts): an empty-element tag, closed with "/>".  */
static void cxml_empty(wchar const *tag, struct ROKXML_binding *atts) { cxml_tag(tag, atts, "/>"); }
/*  cxml_text(text): writes the text through write_string() with no
    quotation mark and nothing afterwards; all-white-space text is
    skipped when drop_white_space is set.
*/
static void cxml_text(wchar const *text) { if (drop_white_space && is_white_space(text)) return; write_string('\0', text, '\0'); }
/*  cxml_cdata(cdata): as written here this passes an empty string
    to write_chars() and never uses `cdata', so it produces no
    output.
    NOTE(review): the body looks as though markup-like string
    literals were lost from this copy of the file -- confirm against
    a pristine source before relying on it.
*/
static void cxml_cdata(wchar const *cdata) { write_chars(""); }
/*  cxml_entity(entity): writes the entity name between '&' and ';'.  */
static void cxml_entity(wchar const *entity) { write_text('&', entity, ';'); }
/*  cxml_pi(target, data): as written here this passes an empty
    string to write_chars() and uses neither argument.
    NOTE(review): same suspicion of lost string literals as for
    cxml_cdata -- confirm against a pristine source.
*/
static void cxml_pi(wchar const *target, wchar const *data) { write_chars(""); }
static void cxml_comment(wchar const *pi) { write_chars("