/*  File   : qh.c
    Author : Richard A. O'Keefe
    Updated: %G%
    Purpose: Small fast non-validating XML parser.

    THERE IS A BUG.  If the input contains a \0, qh stops.
*/

#ifndef	lint
static	char SCCSid[] = "%Z%%E% %M%	%I%";
#endif/*lint*/

/*  If we see
    \0			Pop the source stack.
    &			It's a character reference or entity reference
        &#		It's a decimal or hex character reference
	    &#[xX]	It's a hexadecimal character reference.
			Look for [0-9a-fA-F]+[;]? following.
			Insert the result as a character
	    &#[0-9]	It's a decimal character reference.
			Look for [0-9]*[;]? following.
			Insert the result as a character.
	&[A-Za-z]	It's an entity reference.
			Look for [-_.:A-Za-z0-9]*[;]? following.
			Push the named entity on the stack.
    	&[^#A-Za-z]	It's a literal ampersand.
			Insert the ampersand as a character.
			Push the following characters back for rescanning.

    <			It's a PI, comment, declaration, or tag.
	<?		It's a PI.  Look for .*?> following.
	<!		It's a comment or declaration.
		<![	It's a CDATA section.  Look for "CDATA[".
		<!-	It's a comment.
		<![A-Z]	It's a declaration.
		else	It's an error.
	</>		It's an empty end-tag.  Close the top element on
			the stack, whatever it is.
	</		It's an end-tag tag.  Look for
			([A-Za-z][-_.:A-Za-z0-9]*)?[ \t\n]*> following.
			Process it as such.
	<>		It's an empty start-tag.  Use last_tag.  If that
			is NULL, this must be the first tag, and that may
			not be empty, so use "#EMPTY" instead.
	<[A-Za-z]	It's an open tag.  Look for
			[-_.:A-Za-z0-9]* following, then process attrs.
	<[^?!/A-Za-z]	It's a literal less than sign.
			Insert it as a character.
			Push the following character back for rescanning.
    [^&<]		It's a literal character (including >).
			Insert it as a character.
			If it was a \n, increment the line number.
			

    At the moment, I only want to deal with one kind of declaration,
    and I'll allow it out of its proper place, and it will be wrong.
    <!ENTITY name [CDATA ]string>

    Input will be managed as a stack.
    Each element will have
	- name (file name for files, entity name for entities)
	- a file descriptor (negative for entities)
	- a buffer (pointer to the beginning)
	- a current pointer
	- a line number
*/
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "kind.h"
#include "qh.h"

#ifdef WIDE_CHARS

static size_t
ustrlen(wchar const *x) {
    wchar const *s;

    for (s = x; *s++ != 0; ) {}
    return (s-x)-1;
}

static void
ustrcpy(wchar *d, wchar const *s) {
    while ((*d++ = *s++) != '\0') {}
}

static int
ustrcmp(wchar const *x, wchar const *y) {
    wchar c;

    while ((c = *x++) == *y++) if (c == 0) return 0;
    return c < y[-1] ? -1 : 1;
}

static void
convert(wchar *d, unsigned char const *s, size_t n) {
    while (n-- != 0) *d++ = *s++;
}

#else

#define ustrcmp(x,y) strcmp((char const*)(x), (char const*)(y))

#define ustrlen(x) strlen((char const*)(x))

#define ustrcpy(d, s) (void)strcpy((char *)(d), (char const *)(s))

#define convert(d, s, n) (void)memcpy(d, s, n)
#endif

static wchar *
ustrdup(wchar *s) {
    size_t n = ustrlen(s) + 1;
    wchar *r = malloc(sizeof (wchar) * n);

    if (r == 0) quit("memory ran out");
    (void)memcpy(r, s, sizeof (wchar) * n);
    return r;
}

static wchar *
dupcvt(char const *s) {
    size_t n = strlen(s) + 1;
    wchar *r = malloc(sizeof (wchar) * n);

    if (r == 0) quit("memory ran out");
    convert(r, (unsigned char const *)s, n);
    return r;
}

static void
uputs(FILE *f, wchar const *s) {
    wchar c;

    while ((c = *s++) != '\0') putc(c, f);
}


/*  All the characters we give to the back end are held in a single giant
    array.  The contents of this array have the layout
	{name,{name,value}*}*[character data][entity name]
    If we suppose that
        - tags might be nested 1000 deep
        - each tag might have 8 attributes
        - values might be 80-character file names
        - names are 15 characters each
    then we need 1000*(15+1 + 8*(15+1+80+1)) = 792,000 characters.
    If we also want room for 1MB of character data (not impossible,
    I have such a file) then all things considered about 2MB seems
    about right.
*/
static wchar bigbuf[2*1024*1024];

#define MAX_NAME 2000

#define MAX_ENTS 300
struct Entity {
    wchar *name;
    wchar *value;
    int   is_cdata;
};
static struct Entity entity_dict[MAX_ENTS];
static int           nents;

#define MAX_STACK 100
struct Source {
    wchar *curr;
    wchar *buff;
    wchar *name;    
    int    fd;
    int    lino;
};
static struct Source source_stack[MAX_STACK];
static int           ssp = 0;
static int           sslock = 0;

#define lock_source_stack    sslock = ssp
#define unlock_source_stack  sslock = 0

static int status = EXIT_SUCCESS;

#define MAX_TAGS 1000
static struct ROKXML_binding tag_stack[MAX_TAGS];
static int        tsp = 0;
static wchar      last_tag[128] = {'#','E','M','P','T','Y',0};

#define char_ent(n, s) {{n,0}, s}
static struct {char const *value; char const *name;} char_ents[] = {
    {"&",    "amp"},
    {"<",    "lt"},
    {">",    "gt"},
    {"'",    "apos"},
    {"\"",   "quot"},
    {"\240", "nbsp"},
    {"\241", "iexcl"},
    {"\242", "cent"},
    {"\243", "pound"},
    {"\244", "curren"},
    {"\245", "yen"},
    {"\246", "brvbar"},
    {"\247", "sect"},
    {"\250", "uml"},
    {"\251", "copy"},
    {"\252", "ordf"},
    {"\253", "laquo"},
    {"\254", "not"},
    {"\255", "shy"},
    {"\256", "reg"},
    {"\257", "macr"},
    {"\260", "deg"},
    {"\261", "plusmn"},
    {"\262", "sup2"},
    {"\263", "sup3"},
    {"\264", "acute"},
    {"\265", "micro"},
    {"\266", "para"},
    {"\267", "middot"},
    {"\270", "cedil"},
    {"\271", "sup1"},
    {"\272", "ordm"},
    {"\273", "raquo"},
    {"\274", "frac14"},
    {"\275", "frac12"},
    {"\276", "frac34"},
    {"\277", "iquest"},
    {"\300", "Agrave"},
    {"\301", "Aacute"},
    {"\302", "Acirc"},
    {"\303", "Atilde"},
    {"\304", "Auml"},
    {"\305", "Aring"},
    {"\306", "AElig"},
    {"\307", "Ccedil"},
    {"\310", "Egrave"},
    {"\311", "Eacute"},
    {"\312", "Ecirc"},
    {"\313", "Euml"},
    {"\314", "Igrave"},
    {"\315", "Iacute"},
    {"\316", "Icirc"},
    {"\317", "Iuml"},
    {"\320", "ETH"},
    {"\321", "Ntilde"},
    {"\322", "Ograve"},
    {"\323", "Oacute"},
    {"\324", "Ocirc"},
    {"\325", "Otilde"},
    {"\326", "Ouml"},
    {"\327", "times"},
    {"\330", "Oslash"},
    {"\331", "Ugrave"},
    {"\332", "Uacute"},
    {"\333", "Ucirc"},
    {"\334", "Uuml"},
    {"\335", "Yacute"},
    {"\336", "THORN"},
    {"\337", "szlig"},
    {"\340", "agrave"},
    {"\341", "aacute"},
    {"\342", "acirc"},
    {"\343", "atilde"},
    {"\344", "auml"},
    {"\345", "aring"},
    {"\346", "aelig"},
    {"\347", "ccedil"},
    {"\350", "egrave"},
    {"\351", "eacute"},
    {"\352", "ecirc"},
    {"\353", "euml"},
    {"\354", "igrave"},
    {"\355", "iacute"},
    {"\356", "icirc"},
    {"\357", "iuml"},
    {"\360", "eth"},
    {"\361", "ntilde"},
    {"\362", "ograve"},
    {"\363", "oacute"},
    {"\364", "ocirc"},
    {"\365", "otilde"},
    {"\366", "ouml"},
    {"\367", "divide"},
    {"\370", "oslash"},
    {"\371", "ugrave"},
    {"\372", "uacute"},
    {"\373", "ucirc"},
    {"\374", "uuml"},
    {"\375", "yacute"},
    {"\376", "thorn"},
    {"\377", "yuml"},
    {"``",   "ldquo"},
    {"''",   "rdquo"},
    {"`",    "lsquo"},
    {"\'",   "rsquo"},
    {"--",   "mdash"},
    {"...",  "hellip"},
    {"pi",   "pi"},
    {"~=",   "ne"},
    {"<=",   "le"},
    {">=",   "ge"},
    {":",    "vellip"},
    {"[forall]", "forall"},
    {"",     ""}
};

static void
load_char_ents(void) {
    for (nents = 0; char_ents[nents].value[0] != 0; nents++) {
	entity_dict[nents].name     = dupcvt(char_ents[nents].name);
	entity_dict[nents].value    = (wchar *)char_ents[nents].value,
	entity_dict[nents].is_cdata = 1;
    }
}

static void
say_where(void) {
    int i;

    for (i = 0; i < ssp; i++) {
	fprintf(stderr, source_stack[i].fd < 0 ? "Entity " : "File ");
	uputs(stderr, source_stack[i].name);
	fprintf(stderr, ", line %d\n", source_stack[i].lino);
    }
    for (i = 0; i < tsp; i++) {
	fprintf(stderr, "%.*s<", (i <= 40 ? i : 40),
	        "                                        ");
	uputs(stderr, tag_stack[i].name);
	fprintf(stderr, " ...>\n");
    }
    status = EXIT_FAILURE;
}

void
quit(char const *msg) {
    say_where();
    fprintf(stderr, "Error: %s\n", msg);
    exit(status);
}

void
quit1(char const *msg, wchar const *arg1) {
    say_where();
    fprintf(stderr, "Error ");
    fprintf(stderr, msg, (char const *)arg1);
    fprintf(stderr, "\n");
    exit(status);
}

void
quit2(char const *msg, wchar const *arg1, wchar const *arg2) {
    say_where();
    fprintf(stderr, "Error ");
    fprintf(stderr, msg, (char const *)arg1, (char const *)arg2);
    fprintf(stderr, "\n");
    exit(status);
}

void
warn(char const *msg) {
    int e = status;

    say_where();
    fprintf(stderr, "Warning: %s\n", msg);
    status = e;
}

static int unknown_entities_are_ok = 0;

static struct Entity *
find_entity(wchar *name) {
    struct Entity *p = &entity_dict[0];
    struct Entity *e = &p[nents];

    for (; p != e; p++)
	if (ustrcmp(p->name, name) == 0) return p;
    if (!unknown_entities_are_ok) {
	say_where();
	uputs(stderr, name);
	fprintf(stderr, " is not a known entity\n");
    }
    return NULL;
}

static wchar *
push_entity(struct Entity *ent, wchar *curr) {
    struct Source *p = &source_stack[ssp];

    if (ssp > 0) p[-1].curr = curr;
    if (ssp == MAX_STACK) quit("source stack overflowed");
    p->name = ent->name;
    p->buff = ent->value;
    p->curr = p->buff;
    p->lino = 1;
    p->fd   = -1;
    ssp++;
    return p->curr;
}

#define BUFF_SIZE (16*1024)
#define MAX_BUFFS (16*1024)


static wchar *
push_file(char const *name, wchar *curr) {
    struct Source *p = &source_stack[ssp];
    int n;
    static unsigned char *buffer[MAX_BUFFS];
    int    buflen[MAX_BUFFS];
    int    totlen;
    int    bsp, i;
    wchar *d;

    if (ssp > 0) p[-1].curr = curr;
    if (ssp == MAX_STACK) quit("source stack overflowed");
    p->name = dupcvt(name);
    p->lino = 1;
    if (strcmp(name, "-") == 0 || strcmp(name, "/dev/stdin") == 0) {
    	p->fd = 0;
    } else {
	p->fd = open(name, O_RDONLY, 0);
	if (p->fd < 0) {
	    perror(name);
	    quit("file could not be opened");
	}
    }
    totlen = 0;
    for (bsp = 0; ; bsp++) {
	if (buffer[bsp] == NULL) {
	    buffer[bsp] = malloc(BUFF_SIZE);
	    if (buffer[bsp] == 0) quit("file buffer memory ran out");
	}
	n  = read(p->fd, buffer[bsp], BUFF_SIZE);
	if (n <= 0) break;
	buflen[bsp] = n;
	totlen += n;
    }
    if (n < 0) {
	perror(name);
	quit("input error");
    }
    if (p->fd != 0) (void) close(p->fd);
    d = malloc(sizeof (wchar) * (totlen+1));
    if (d == 0) quit("file buffer memory ran out");
    p->curr = p->buff = d;
    for (i = 0; i < bsp; i++) {
	n = buflen[i];
	convert(d, buffer[i], n);
	d += n;
    }
    *d = 0;
    ssp++;
    return p->curr;
}

static wchar *
pop_source(void) {
    struct Source *p;

    if (ssp == 0) quit("source stack underflowed");
    if (ssp == sslock) quit("entity ended within value literal");
    p = &source_stack[--ssp];
    if (p->fd >= 0) free(p->buff);
    return ssp == 0 ? NULL : source_stack[ssp-1].curr;
}

static struct ROKXML_handlers const *h;

#if 0

static void
process_PI(wchar *pi) {
    wchar *p, *pe;		/* "name" part */ 
    wchar *q, *qe;		/* rest, minus leading/trailing spaces */

    pe = p = pi;
    if (is_alpha(*pe))
	do pe++; while (is_alnum(*pe));
    q = pe;
    while (is_white(*q)) q++;
    qe = q;
    while (*qe != '\0') qe++;
    while (qe != q && is_white(qe[-1])) qe--;
#ifndef PARSE_ONLY
    /* ESIS just wants the whole string */
    write_string('?', pi);
    /*printf("PI name='%.*s' value = '%.*s'\n",
	(int)(pe-p), (char*)p, (int)(qe-q), (char*)q);*/
#endif
}

#endif

static void
free_bindings(struct ROKXML_binding *list) {
    struct ROKXML_binding *p, *q;

    for (p = list; p != 0; p = q) {
	q = p->next;
	if (p->name  != 0) free(p->name);
	if (p->value != 0) free(p->value);
    }
}

static void
pop_tag_stack(void) {
    struct ROKXML_binding *t = &tag_stack[--tsp];

    free_bindings(t->next);
    ustrcpy(last_tag, t->name);
}

static void
process_end_tag(wchar *name) {
    if (tsp == 0) {
	quit1("end tag </%s> with no open elements", name);
    } else {
	struct ROKXML_binding *t = &tag_stack[tsp-1];

	if (*name != 0 && ustrcmp(name, t->name) != 0) {
	    quit2("<%s> is open but found </%s> instead", t->name, name);
	} else {
	    h->end(t->name, t->next);
	    tsp--;
	    free_bindings(t->next);
	    ustrcpy(last_tag, t->name);
	}
    }
}

static void
begin_start_tag(wchar *name) {
    if (tsp == MAX_TAGS) quit("element stack overflowed");
    if (name == 0 || *name == '\0') name = last_tag;
    if (name == 0 || *name == '\0')
	quit("<> is not legal without a preceding element\n");
    tag_stack[tsp].name = ustrdup(name);
    tag_stack[tsp].next = NULL;
    if (last_tag[0] == '#') ustrcpy(last_tag, tag_stack[tsp].name);
}

static void
add_binding_for(wchar *name) {
    struct ROKXML_binding *t = &tag_stack[tsp];
    struct ROKXML_binding *r;

    r = malloc(sizeof *r);
    if (r == 0) quit("memory ran out");
    r->next = t->next;
    r->name = ustrdup(name);
    r->value = NULL;
    t->next = r;
}

static void
set_binding_value(wchar *value) {
    tag_stack[tsp].next->value = ustrdup(value);
}

static struct ROKXML_binding *
merge(struct ROKXML_binding *a, struct ROKXML_binding *b) {
    struct ROKXML_binding *r = NULL;
    struct ROKXML_binding **e = &r;
    struct ROKXML_binding *t;
    int c;

    while (a != NULL && b != NULL) {
	c = ustrcmp(a->name, b->name);
	if (c < 0) {
	    *e = a;
	    e = &a->next;
	    a = *e;
	} else
	if (c > 0) {
	    *e = b;
	    e = &b->next;
	    b = *e;
	} else {
	    say_where();
	    if (ustrcmp(a->value, b->value) == 0) {
		fprintf(stderr, "Duplicate bindings for ");
		uputs(stderr, a->name);
		fprintf(stderr, "\n");
	    } else {
		fprintf(stderr, "Conflicting bindings for ");
		uputs(stderr, a->name);
		fprintf(stderr, ";  later one taken\n");
	    }
	    t = a;
	    a = t->next;
	    free(t->name);
	    free(t->value);
	    *e = b;
	    e = &b->next;
	    b = *e;
	}
    }
    *e = a != NULL ? a : b;     
   return r;
}
	    
static struct ROKXML_binding *
msort(struct ROKXML_binding *a) {
    struct ROKXML_binding *p, *q;
    
    if (a == NULL) return a;
    p = q = a;
    while ((p = p->next) != NULL && p->next != NULL)
	q = q->next, p = p->next;
    /* Now q is half way along */
    p = q->next;
    q->next = NULL;
    return p == NULL ? a : merge(msort(a), msort(p));
}


static void
process_start_tag(
    void (*handler)(wchar const *, struct ROKXML_binding *)
) {
    struct ROKXML_binding *t = &tag_stack[tsp];

    tsp++;
    t->next = msort(t->next);
    handler(t->name, t->next);
}

static wchar *
parse_string(wchar *curr, int q, wchar *d, wchar *e) {
    wchar  c;
    static char oflo[] = "value buffer overflow";
#   define deposit(c) if (d == e) quit(oflo); else *d++ = c
#ifdef WARN_ABOUT_LT
#define LAST_SPECIAL '<'
#else
#define LAST_SPECIAL '&'
#endif

    lock_source_stack;
    if (q == 0) {
	/* legal SGML, not legal XML */
	while (is_alnum(c = *curr++)) deposit(c);
	curr--;
    } else {
	for (;;) {
	    c = *curr++;
	    if (c > LAST_SPECIAL) {
		deposit(c);
	    } else
	    if (c <= ' ') {
		if (c == ' ') {
		    deposit(c);
		} else
		if (c == '\0') {
		    curr = pop_source();
		    if (curr == NULL) quit("locking broke");
		} else {
#ifdef GENERAL_LINE_END
		    if (c == '\r') {
			if (*curr == '\n') curr++;
			c = '\n';
		    }
#endif
		    if (c == '\n') source_stack[ssp-1].lino++;
		    else if (c != '\t') quit("non-layout control char");
		    deposit(' ');
		}
	    } else
	    if (c == q) {
		break;
	    } else
	    if (c == '&') {
#ifdef GENERAL_LINE_END
#		define end_ref {				\
		    if (c == '\r')				\
			if (*curr == (c = '\n') curr++;		\
		    if (c == '\n') source_stack[ssp-1].lino++;	\
		    else if (c != ';') curr--;			\
		}
#else
#               define end_ref {				\
		    if (c == '\n') source_stack[ssp-1].lino++;	\
		    else if (c != ';') curr--;			\
		}
#endif

		c = *curr++;
		if (c == '#') {
		    c = *curr++;
		    if (c == 'x' || c == 'X') {
			int n = 0;

			while (is_hexit(c = *curr++))
			    n = n*16 + kind[c];
			end_ref
			deposit(n);
		    } else
		    if (c >= '0' && c <= '9') {
			int n = 0;

			do {
			    n = n*10 + (c - '0');
			    c = *curr++;
			} while (c >= '0' && c <= '9');
			end_ref
			deposit(n);
		    } else {
			/* &#? is not a character reference */
			say_where();
			fprintf(stderr,
				"malformed character reference &#%c\n", c);
			curr -= 2;
			deposit('&');
		    }
		} else
		if (is_alpha(c)) {
		    struct Entity *y;
		    wchar *t;

		    t = d;
		    do {
			if (t == e) quit("name buffer overflow");
		       *t++ = c;
		    } while (is_alnum(c = *curr++));
		    end_ref
		    *t = 0;
		    y = find_entity(d);
		    if (y == NULL) {
			t = d;
			while ((c = *t++) != '\0') deposit(c);
		    } else
		    if (y->is_cdata) {
			t = y->value;
			while ((c = *t++) != '\0') deposit(c);
		    } else {
			curr = push_entity(y, curr);
		    }
		} else {
		    /* &? is plain text and is legal */		
		    curr -= 1;
		    deposit('&');
		}
#               undef end_ref
	    } else {
#ifdef WARN_ABOUT_LT
		if (c == '<') warn("XML 1.0 pointlessly forbids '<' here");
#endif
		deposit(c);
	    }
	}
    }
    unlock_source_stack;
    *d = 0;
    return curr;
#   undef deposit
}

static int
is_prefix(char const *pat, wchar const *src) {
    wchar c;

    while ((c = *(unsigned char *)pat++) != '\0')
	if (c != *src++) return 0;
    return 1;
}

static void
parse(void) {
    wchar *curr = source_stack[0].curr;
    wchar  c;
    wchar * const e = &bigbuf[sizeof bigbuf/sizeof bigbuf[0]];
    wchar * b = bigbuf;
    wchar *d = b;
    static char oflo[] = "string buffer overflowed";
#   define data_character(c) {				\
	if (d == e) quit(oflo);				\
	*d++ = c;					\
    }
#   define flush_data_buff {				\
        if (d != b && tsp != 0) {			\
	    *d = '\0';					\
	    h->text(b);				\
	}						\
	d = b;					\
    }
#   define increment_line_number source_stack[ssp-1].lino++
#   define deblank while (is_white(c)) {		\
	if (c == '\n') increment_line_number;		\
	c = *curr++;					\
    }

    for (;;) {
	c = *curr++;
	if (c == '\0') {
	    curr = pop_source();
	    if (curr == NULL) {
		flush_data_buff
		return;
	    }
	} else
	if (c == '&') {
#           define end_ref {				\
		if (c == '\n') increment_line_number;	\
		else if (c != ';') curr--;		\
	    }

	    c = *curr++;
	    if (c == '#') {
	        c = *curr++;
	        if (c == 'x' || c == 'X') {
		    wchar n = 0;
		    
		    while (is_hexit(c = *curr++))
			n = n*16 + kind[c];
		    end_ref
		    data_character(n);
		} else
		if (c >= '0' && c <= '9') {
		    wchar n = 0;
		    
		    do {
		        n = n*10 + (c - '0');
		        c = *curr++;
		    } while (c >= '0' && c <= '9');
		    end_ref
		    data_character(n);
		} else {
		    /* &#? is not a character reference */
		    say_where();
		    fprintf(stderr, "malformed character reference &#%c\n", c);
		    curr -= 2;
		    data_character('&');
		}
	    } else
	    if (is_alpha(c)) {
		struct Entity *y;
		wchar * const old_d = d;

		d++;	/* make room for \0 if we need to flush */
		do {
		    data_character(c);
		} while (is_alnum(c = *curr++));
		end_ref
		*d = 0;
		d = old_d;
		y = find_entity(old_d + 1);
		if (y == NULL) {
		    flush_data_buff
		    h->entity(old_d + 1);
		} else
		if (y->is_cdata) {
		    wchar *s = y->value;

		    while ((c = *s++) != '\0') data_character(c);
		} else {
		    curr = push_entity(y, curr);
		}
	    } else {
		/* &? is plain text and is legal */		
		curr -= 1;
		data_character('&');
	    }
#           undef end_ref
	} else
	if (c != '<') {
	    if (c == '\n') increment_line_number;
	    data_character(c);
	} else
	if ((c = *curr++) == '?') {
	    flush_data_buff
	    for (;;) {
		c = *curr++;
		if (c == '?' && *curr == '>') {
		    curr++;
		    break;
		}
		if (c == '\n') increment_line_number;
		data_character(c);
	    }
	    *d = 0;
	    for (d = b; *d > ' '; d++) {}
	    if (*d == ' ') *d++ = '\0'; else d = (wchar *)0;
	    h->pi(b, d);
	    d = b;
	} else
	if (c == '!') {
	    c = *curr++;
	    if (c == '[') {
		/* SGML has <![ (IGNORE|INCLUDE|TEMP|CDATA|RCDATA)* [ */
		/* but XML has only <![CDATA[ with no spaces. */
		if (!is_prefix("CDATA[", curr))
		    quit("'<![' not followed by 'CDATA['");
		if (h->cdata != 0) flush_data_buff
		curr += sizeof "CDATA[" - 1;
		for (;;) {
		    c = *curr++;
		    if (c == ']' && curr[0] == ']' && curr[1] == '>') break;
		    data_character(c);
		}
		if (h->cdata != 0) {
		    *d = 0;
		    h->cdata(b);
		    d = b;
		}
		curr += 2;
	    } else
	    if (c == '-') {
		if (h->comment != 0) {
		    do {
			if (curr[0] != '-') quit("<!- missing second dash");
			curr++;
			flush_data_buff		    
			do {
			    c = *curr++;
			    if (c == '\0') quit("&entity ended in comment");
			    if (c == '\n') increment_line_number;
			    data_character(c);
			} while (c != '-' || curr[0] != c);
			*--d = '\0';
			if (tsp != 0) h->comment(b);
			d = b;
			curr++;
			c = *curr++;
			deblank
		    } while (c == '-');
		} else {
		    do {
			if (curr[0] != '-') quit("<!- missing second dash");
			curr++;
			do {
			    c = *curr++;
			    if (c == '\0') quit("&entity ended in comment");
			    if (c == '\n') increment_line_number;
			} while (c != '-' || curr[0] != c);
			curr++;
			c = *curr++;
			deblank
		    } while (c == '-');
		}
		if (c != '>') quit("<!-- comment -- missing its closing >");
	    } else
	    if (c == '>') {
		/* <!> is a legal empty comment in SGML & HTML */
		/* XML doesn't allow it, but it's quite harmless */
		warn("<!> comments are not allowed in XML");
	    } else
	    if (is_alnum(c)) {
		int n = 1;

		/* Only expecting <!DOCTYPE> here. */
		/* For testing purposes, I'll allow <!ENTITY>. */
		/* I want my own <!COPY (PUBLIC pubid [sysid]|SYSTEM sysid)> */
		fprintf(stderr, "<!declaration> skipped\n");
		for (;;) {
		    switch (c = *curr++) {
			case '>' : if (--n == 0) break; continue;
			case '<' : n++; continue;
			case '\"': while ((c = *curr++) != '\"' && c != 0) {}
				   if (c == 0) abort(); continue;
			case '\'': while ((c = *curr++) != '\'' && c != 0) {}
				   if (c == 0) abort(); continue;
			case '-' : if (curr[0] == c) {
				       do curr++;
				       while (curr[0] != c || curr[1] != c);
				       curr++;
				   }				           
				   /*FALLTHROUGH*/
			default  : continue;
		    }
		    break;
		}
	    } else {
		quit("malformed <!declaration");
	    }
	} else
	if (c == '/') {
	    flush_data_buff
	    c = *curr++;
	    if (is_alpha(c)) {
		do {
		    data_character(c);
		} while (is_alnum(c = *curr++));
	    }
	    *d = 0;
	    d = b;
	    deblank
	    if (c == '>') {
		(void)0;
	    } else
	    if (c == '<') {
		say_where();
		fprintf(stderr, "</");
		uputs(stderr, d);
		fprintf(stderr, "< is not allowed in XML\n");
		curr--;
	    } else
	    if (c == '/' && *curr == '>') {
		say_where();
		fprintf(stderr, "</");
		uputs(stderr, d);
		fprintf(stderr, "/> should use > not />");
		curr++;
	    } else {
		quit("mangled end tag");
	    }
	    process_end_tag(d);
	} else
	if (is_alpha(c)) {
	    flush_data_buff
	    do {
		data_character(c);
	    } while (is_alnum(c = *curr++));
	    *d = 0;
	    d = b;
	    begin_start_tag(d);
	    for (;;) {
		deblank
		if (!is_alpha(c)) break;
		d = b;
		do {
		    data_character(c);
		} while (is_alnum(c = *curr++));
		*d = 0;
		d = b;
		add_binding_for(d);
		deblank
		if (c != '=') quit("= missing after attribute");
		c = *curr++;
		deblank
		if (c != '\'' && c != '\"') curr--, c = '\0';
		curr = parse_string(curr, c, d, e);
		c = ' ';
		set_binding_value(d);
	    }
	    if (c == '>') {
		process_start_tag(h->begin);
	    } else
	    if (c == '<') {
		say_where();
		fprintf(stderr, "<");
		uputs(stderr, tag_stack[tsp-1].name);
		fprintf(stderr, "< is not allowed in XML\n");
		curr--;
		process_start_tag(h->begin);
	    } else
	    if (c == '/' && *curr == '>') {
		curr++;
		process_start_tag(h->empty);
		pop_tag_stack();
	    } else
	    if (c == '\'' || c == '\"' || c == '=') {
		quit("attribute name missing");
	    } else {
		quit("mangled end tag");
	    }
	} else
	if (c == '>') {
	    flush_data_buff
	    warn("<> empty start-tag is not allowed in XML");
	    b[0] = '\0';
	    begin_start_tag(b);
	    process_start_tag(h->begin);
	} else {
	    curr--;
	    data_character('<');
	}
    }
#   undef deblank
#   undef increment_line_number
}

int
main(int argc, char **argv) {
    int handlers_set = 0;

    set_up_kind();
    load_char_ents();

    while (argc > 1 && argv[1][0] == '-') {
	switch (argv[1][1]) {
	    case 'e': case 'E':
		unknown_entities_are_ok = 1;
		argc--, argv++;
		continue;
	    case 'h': case 'H':
		argc--, argv++;
		if (argv[0][2] == '\0') {
		    argc--, argv++;
		    h = get_handlers(argv[0]);
		} else {
		    h = get_handlers(argv[0]+2);
		}
		handlers_set = 1;
		continue;
	    case 'k': case 'K':
		argc--, argv++;
		if (argv[0][2] == '\0') {
		    argc--, argv++;
		    new_counter(argv[0]);
		} else {
		    new_counter(argv[0]+2);
		}
		continue;
	    case '-':
		argc--, argv++;
		break;
	    case '\0':
		break;
	    default:
		fprintf(stderr, "Unknown option: %s\n", argv[1]);
		exit(EXIT_FAILURE);
	}
	break;
    }
    if (!handlers_set) h = get_handlers("e?");
    h = install_counters(h);
    (void) push_file(argc > 1 ? argv[1] : "-", NULL);
    h->start((unsigned char const*)0, (unsigned char const*)0);
    parse();
    h->finish();
    return status;
}