/* File : qh.c Author : Richard A. O'Keefe Updated: %G% Purpose: Small fast non-validating XML parser. THERE IS A BUG. If the input contains a \0, qh stops. */ #ifndef lint static char SCCSid[] = "%Z%%E% %M% %I%"; #endif/*lint*/ /* If we see \0 Pop the source stack. & It's a character reference or entity reference &# It's a decimal or hex character reference &#[xX] It's a hexadecimal character reference. Look for [0-9a-fA-F]+[;]? following. Insert the result as a character &#[0-9] It's a decimal character reference. Look for [0-9]*[;]? following. Insert the result as a character. &[A-Za-z] It's an entity reference. Look for [-_.:A-Za-z0-9]*[;]? following. Push the named entity on the stack. &[^#A-Za-z] It's a literal ampersand. Insert the ampersand as a character. Push the following characters back for rescanning. < It's a PI, comment, declaration, or tag. following. It's an empty end-tag. Close the top element on the stack, whatever it is. following. Process it as such. <> It's an empty start-tag. Use last_tag. If that is NULL, this must be the first tag, and that may not be empty, so use "#EMPTY" instead. <[A-Za-z] It's an open tag. Look for [-_.:A-Za-z0-9]* following, then process attrs. <[^?!/A-Za-z] It's a literal less than sign. Insert it as a character. Push the following character back for rescanning. [^&<] It's a literal character (including >). Insert it as a character. If it was a \n, increment the line number. At the moment, I only want to deal with one kind of declaration, and I'll allow it out of its proper place, and it will be wrong. Input will be managed as a stack. Each element will have - name (file name for files, entity name for entities) - a file descriptor (negative for entities) - a buffer (pointer to the beginning) - a current pointer - a line number */ #include #include #include #include #include #include #include #include "kind.h" #include "qh.h" #ifdef WIDE_CHARS static size_t ustrlen(wchar const *x) { wchar const *s; for (s = x; *s++ != 0; ) {} return (s-x)-1; } static void ustrcpy(wchar *d, wchar const *s) { while ((*d++ = *s++) != '\0') {} } static int ustrcmp(wchar const *x, wchar const *y) { wchar c; while ((c = *x++) == *y++) if (c == 0) return 0; return c < y[-1] ? -1 : 1; } static void convert(wchar *d, unsigned char const *s, size_t n) { while (n-- != 0) *d++ = *s++; } #else #define ustrcmp(x,y) strcmp((char const*)(x), (char const*)(y)) #define ustrlen(x) strlen((char const*)(x)) #define ustrcpy(d, s) (void)strcpy((char *)(d), (char const *)(s)) #define convert(d, s, n) (void)memcpy(d, s, n) #endif static wchar * ustrdup(wchar *s) { size_t n = ustrlen(s) + 1; wchar *r = malloc(sizeof (wchar) * n); if (r == 0) quit("memory ran out"); (void)memcpy(r, s, sizeof (wchar) * n); return r; } static wchar * dupcvt(char const *s) { size_t n = strlen(s) + 1; wchar *r = malloc(sizeof (wchar) * n); if (r == 0) quit("memory ran out"); convert(r, (unsigned char const *)s, n); return r; } static void uputs(FILE *f, wchar const *s) { wchar c; while ((c = *s++) != '\0') putc(c, f); } /* All the characters we give to the back end are held in a single giant array. The contents of this array have the layout {name,{name,value}*}*[character data][entity name] If we suppose that - tags might be nested 1000 deep - each tag might have 8 attributes - values might be 80-character file names - names are 15 characters each then we need 1000*(15+1 + 8*(15+1+80+1)) = 792,000 characters. If we also want room for 1MB of character data (not impossible, I have such a file) then all things considered about 2MB seems about right. */ static wchar bigbuf[2*1024*1024]; #define MAX_NAME 2000 #define MAX_ENTS 300 struct Entity { wchar *name; wchar *value; int is_cdata; }; static struct Entity entity_dict[MAX_ENTS]; static int nents; #define MAX_STACK 100 struct Source { wchar *curr; wchar *buff; wchar *name; int fd; int lino; }; static struct Source source_stack[MAX_STACK]; static int ssp = 0; static int sslock = 0; #define lock_source_stack sslock = ssp #define unlock_source_stack sslock = 0 static int status = EXIT_SUCCESS; #define MAX_TAGS 1000 static struct ROKXML_binding tag_stack[MAX_TAGS]; static int tsp = 0; static wchar last_tag[128] = {'#','E','M','P','T','Y',0}; #define char_ent(n, s) {{n,0}, s} static struct {char const *value; char const *name;} char_ents[] = { {"&", "amp"}, {"<", "lt"}, {">", "gt"}, {"'", "apos"}, {"\"", "quot"}, {"\240", "nbsp"}, {"\241", "iexcl"}, {"\242", "cent"}, {"\243", "pound"}, {"\244", "curren"}, {"\245", "yen"}, {"\246", "brvbar"}, {"\247", "sect"}, {"\250", "uml"}, {"\251", "copy"}, {"\252", "ordf"}, {"\253", "laquo"}, {"\254", "not"}, {"\255", "shy"}, {"\256", "reg"}, {"\257", "macr"}, {"\260", "deg"}, {"\261", "plusmn"}, {"\262", "sup2"}, {"\263", "sup3"}, {"\264", "acute"}, {"\265", "micro"}, {"\266", "para"}, {"\267", "middot"}, {"\270", "cedil"}, {"\271", "sup1"}, {"\272", "ordm"}, {"\273", "raquo"}, {"\274", "frac14"}, {"\275", "frac12"}, {"\276", "frac34"}, {"\277", "iquest"}, {"\300", "Agrave"}, {"\301", "Aacute"}, {"\302", "Acirc"}, {"\303", "Atilde"}, {"\304", "Auml"}, {"\305", "Aring"}, {"\306", "AElig"}, {"\307", "Ccedil"}, {"\310", "Egrave"}, {"\311", "Eacute"}, {"\312", "Ecirc"}, {"\313", "Euml"}, {"\314", "Igrave"}, {"\315", "Iacute"}, {"\316", "Icirc"}, {"\317", "Iuml"}, {"\320", "ETH"}, {"\321", "Ntilde"}, {"\322", "Ograve"}, {"\323", "Oacute"}, {"\324", "Ocirc"}, {"\325", "Otilde"}, {"\326", "Ouml"}, {"\327", "times"}, {"\330", "Oslash"}, {"\331", "Ugrave"}, {"\332", "Uacute"}, {"\333", "Ucirc"}, {"\334", "Uuml"}, {"\335", "Yacute"}, {"\336", "THORN"}, {"\337", "szlig"}, {"\340", "agrave"}, {"\341", "aacute"}, {"\342", "acirc"}, {"\343", "atilde"}, {"\344", "auml"}, {"\345", "aring"}, {"\346", "aelig"}, {"\347", "ccedil"}, {"\350", "egrave"}, {"\351", "eacute"}, {"\352", "ecirc"}, {"\353", "euml"}, {"\354", "igrave"}, {"\355", "iacute"}, {"\356", "icirc"}, {"\357", "iuml"}, {"\360", "eth"}, {"\361", "ntilde"}, {"\362", "ograve"}, {"\363", "oacute"}, {"\364", "ocirc"}, {"\365", "otilde"}, {"\366", "ouml"}, {"\367", "divide"}, {"\370", "oslash"}, {"\371", "ugrave"}, {"\372", "uacute"}, {"\373", "ucirc"}, {"\374", "uuml"}, {"\375", "yacute"}, {"\376", "thorn"}, {"\377", "yuml"}, {"``", "ldquo"}, {"''", "rdquo"}, {"`", "lsquo"}, {"\'", "rsquo"}, {"--", "mdash"}, {"...", "hellip"}, {"pi", "pi"}, {"~=", "ne"}, {"<=", "le"}, {">=", "ge"}, {":", "vellip"}, {"[forall]", "forall"}, {"", ""} }; static void load_char_ents(void) { for (nents = 0; char_ents[nents].value[0] != 0; nents++) { entity_dict[nents].name = dupcvt(char_ents[nents].name); entity_dict[nents].value = (wchar *)char_ents[nents].value, entity_dict[nents].is_cdata = 1; } } static void say_where(void) { int i; for (i = 0; i < ssp; i++) { fprintf(stderr, source_stack[i].fd < 0 ? "Entity " : "File "); uputs(stderr, source_stack[i].name); fprintf(stderr, ", line %d\n", source_stack[i].lino); } for (i = 0; i < tsp; i++) { fprintf(stderr, "%.*s<", (i <= 40 ? i : 40), " "); uputs(stderr, tag_stack[i].name); fprintf(stderr, " ...>\n"); } status = EXIT_FAILURE; } void quit(char const *msg) { say_where(); fprintf(stderr, "Error: %s\n", msg); exit(status); } void quit1(char const *msg, wchar const *arg1) { say_where(); fprintf(stderr, "Error "); fprintf(stderr, msg, (char const *)arg1); fprintf(stderr, "\n"); exit(status); } void quit2(char const *msg, wchar const *arg1, wchar const *arg2) { say_where(); fprintf(stderr, "Error "); fprintf(stderr, msg, (char const *)arg1, (char const *)arg2); fprintf(stderr, "\n"); exit(status); } void warn(char const *msg) { int e = status; say_where(); fprintf(stderr, "Warning: %s\n", msg); status = e; } static int unknown_entities_are_ok = 0; static struct Entity * find_entity(wchar *name) { struct Entity *p = &entity_dict[0]; struct Entity *e = &p[nents]; for (; p != e; p++) if (ustrcmp(p->name, name) == 0) return p; if (!unknown_entities_are_ok) { say_where(); uputs(stderr, name); fprintf(stderr, " is not a known entity\n"); } return NULL; } static wchar * push_entity(struct Entity *ent, wchar *curr) { struct Source *p = &source_stack[ssp]; if (ssp > 0) p[-1].curr = curr; if (ssp == MAX_STACK) quit("source stack overflowed"); p->name = ent->name; p->buff = ent->value; p->curr = p->buff; p->lino = 1; p->fd = -1; ssp++; return p->curr; } #define BUFF_SIZE (16*1024) #define MAX_BUFFS (16*1024) static wchar * push_file(char const *name, wchar *curr) { struct Source *p = &source_stack[ssp]; int n; static unsigned char *buffer[MAX_BUFFS]; int buflen[MAX_BUFFS]; int totlen; int bsp, i; wchar *d; if (ssp > 0) p[-1].curr = curr; if (ssp == MAX_STACK) quit("source stack overflowed"); p->name = dupcvt(name); p->lino = 1; if (strcmp(name, "-") == 0 || strcmp(name, "/dev/stdin") == 0) { p->fd = 0; } else { p->fd = open(name, O_RDONLY, 0); if (p->fd < 0) { perror(name); quit("file could not be opened"); } } totlen = 0; for (bsp = 0; ; bsp++) { if (buffer[bsp] == NULL) { buffer[bsp] = malloc(BUFF_SIZE); if (buffer[bsp] == 0) quit("file buffer memory ran out"); } n = read(p->fd, buffer[bsp], BUFF_SIZE); if (n <= 0) break; buflen[bsp] = n; totlen += n; } if (n < 0) { perror(name); quit("input error"); } if (p->fd != 0) (void) close(p->fd); d = malloc(sizeof (wchar) * (totlen+1)); if (d == 0) quit("file buffer memory ran out"); p->curr = p->buff = d; for (i = 0; i < bsp; i++) { n = buflen[i]; convert(d, buffer[i], n); d += n; } *d = 0; ssp++; return p->curr; } static wchar * pop_source(void) { struct Source *p; if (ssp == 0) quit("source stack underflowed"); if (ssp == sslock) quit("entity ended within value literal"); p = &source_stack[--ssp]; if (p->fd >= 0) free(p->buff); return ssp == 0 ? NULL : source_stack[ssp-1].curr; } static struct ROKXML_handlers const *h; #if 0 static void process_PI(wchar *pi) { wchar *p, *pe; /* "name" part */ wchar *q, *qe; /* rest, minus leading/trailing spaces */ pe = p = pi; if (is_alpha(*pe)) do pe++; while (is_alnum(*pe)); q = pe; while (is_white(*q)) q++; qe = q; while (*qe != '\0') qe++; while (qe != q && is_white(qe[-1])) qe--; #ifndef PARSE_ONLY /* ESIS just wants the whole string */ write_string('?', pi); /*printf("PI name='%.*s' value = '%.*s'\n", (int)(pe-p), (char*)p, (int)(qe-q), (char*)q);*/ #endif } #endif static void free_bindings(struct ROKXML_binding *list) { struct ROKXML_binding *p, *q; for (p = list; p != 0; p = q) { q = p->next; if (p->name != 0) free(p->name); if (p->value != 0) free(p->value); } } static void pop_tag_stack(void) { struct ROKXML_binding *t = &tag_stack[--tsp]; free_bindings(t->next); ustrcpy(last_tag, t->name); } static void process_end_tag(wchar *name) { if (tsp == 0) { quit1("end tag with no open elements", name); } else { struct ROKXML_binding *t = &tag_stack[tsp-1]; if (*name != 0 && ustrcmp(name, t->name) != 0) { quit2("<%s> is open but found instead", t->name, name); } else { h->end(t->name, t->next); tsp--; free_bindings(t->next); ustrcpy(last_tag, t->name); } } } static void begin_start_tag(wchar *name) { if (tsp == MAX_TAGS) quit("element stack overflowed"); if (name == 0 || *name == '\0') name = last_tag; if (name == 0 || *name == '\0') quit("<> is not legal without a preceding element\n"); tag_stack[tsp].name = ustrdup(name); tag_stack[tsp].next = NULL; if (last_tag[0] == '#') ustrcpy(last_tag, tag_stack[tsp].name); } static void add_binding_for(wchar *name) { struct ROKXML_binding *t = &tag_stack[tsp]; struct ROKXML_binding *r; r = malloc(sizeof *r); if (r == 0) quit("memory ran out"); r->next = t->next; r->name = ustrdup(name); r->value = NULL; t->next = r; } static void set_binding_value(wchar *value) { tag_stack[tsp].next->value = ustrdup(value); } static struct ROKXML_binding * merge(struct ROKXML_binding *a, struct ROKXML_binding *b) { struct ROKXML_binding *r = NULL; struct ROKXML_binding **e = &r; struct ROKXML_binding *t; int c; while (a != NULL && b != NULL) { c = ustrcmp(a->name, b->name); if (c < 0) { *e = a; e = &a->next; a = *e; } else if (c > 0) { *e = b; e = &b->next; b = *e; } else { say_where(); if (ustrcmp(a->value, b->value) == 0) { fprintf(stderr, "Duplicate bindings for "); uputs(stderr, a->name); fprintf(stderr, "\n"); } else { fprintf(stderr, "Conflicting bindings for "); uputs(stderr, a->name); fprintf(stderr, "; later one taken\n"); } t = a; a = t->next; free(t->name); free(t->value); *e = b; e = &b->next; b = *e; } } *e = a != NULL ? a : b; return r; } static struct ROKXML_binding * msort(struct ROKXML_binding *a) { struct ROKXML_binding *p, *q; if (a == NULL) return a; p = q = a; while ((p = p->next) != NULL && p->next != NULL) q = q->next, p = p->next; /* Now q is half way along */ p = q->next; q->next = NULL; return p == NULL ? a : merge(msort(a), msort(p)); } static void process_start_tag( void (*handler)(wchar const *, struct ROKXML_binding *) ) { struct ROKXML_binding *t = &tag_stack[tsp]; tsp++; t->next = msort(t->next); handler(t->name, t->next); } static wchar * parse_string(wchar *curr, int q, wchar *d, wchar *e) { wchar c; static char oflo[] = "value buffer overflow"; # define deposit(c) if (d == e) quit(oflo); else *d++ = c #ifdef WARN_ABOUT_LT #define LAST_SPECIAL '<' #else #define LAST_SPECIAL '&' #endif lock_source_stack; if (q == 0) { /* legal SGML, not legal XML */ while (is_alnum(c = *curr++)) deposit(c); curr--; } else { for (;;) { c = *curr++; if (c > LAST_SPECIAL) { deposit(c); } else if (c <= ' ') { if (c == ' ') { deposit(c); } else if (c == '\0') { curr = pop_source(); if (curr == NULL) quit("locking broke"); } else { #ifdef GENERAL_LINE_END if (c == '\r') { if (*curr == '\n') curr++; c = '\n'; } #endif if (c == '\n') source_stack[ssp-1].lino++; else if (c != '\t') quit("non-layout control char"); deposit(' '); } } else if (c == q) { break; } else if (c == '&') { #ifdef GENERAL_LINE_END # define end_ref { \ if (c == '\r') \ if (*curr == (c = '\n') curr++; \ if (c == '\n') source_stack[ssp-1].lino++; \ else if (c != ';') curr--; \ } #else # define end_ref { \ if (c == '\n') source_stack[ssp-1].lino++; \ else if (c != ';') curr--; \ } #endif c = *curr++; if (c == '#') { c = *curr++; if (c == 'x' || c == 'X') { int n = 0; while (is_hexit(c = *curr++)) n = n*16 + kind[c]; end_ref deposit(n); } else if (c >= '0' && c <= '9') { int n = 0; do { n = n*10 + (c - '0'); c = *curr++; } while (c >= '0' && c <= '9'); end_ref deposit(n); } else { /* &#? is not a character reference */ say_where(); fprintf(stderr, "malformed character reference &#%c\n", c); curr -= 2; deposit('&'); } } else if (is_alpha(c)) { struct Entity *y; wchar *t; t = d; do { if (t == e) quit("name buffer overflow"); *t++ = c; } while (is_alnum(c = *curr++)); end_ref *t = 0; y = find_entity(d); if (y == NULL) { t = d; while ((c = *t++) != '\0') deposit(c); } else if (y->is_cdata) { t = y->value; while ((c = *t++) != '\0') deposit(c); } else { curr = push_entity(y, curr); } } else { /* &? is plain text and is legal */ curr -= 1; deposit('&'); } # undef end_ref } else { #ifdef WARN_ABOUT_LT if (c == '<') warn("XML 1.0 pointlessly forbids '<' here"); #endif deposit(c); } } } unlock_source_stack; *d = 0; return curr; # undef deposit } static int is_prefix(char const *pat, wchar const *src) { wchar c; while ((c = *(unsigned char *)pat++) != '\0') if (c != *src++) return 0; return 1; } static void parse(void) { wchar *curr = source_stack[0].curr; wchar c; wchar * const e = &bigbuf[sizeof bigbuf/sizeof bigbuf[0]]; wchar * b = bigbuf; wchar *d = b; static char oflo[] = "string buffer overflowed"; # define data_character(c) { \ if (d == e) quit(oflo); \ *d++ = c; \ } # define flush_data_buff { \ if (d != b && tsp != 0) { \ *d = '\0'; \ h->text(b); \ } \ d = b; \ } # define increment_line_number source_stack[ssp-1].lino++ # define deblank while (is_white(c)) { \ if (c == '\n') increment_line_number; \ c = *curr++; \ } for (;;) { c = *curr++; if (c == '\0') { curr = pop_source(); if (curr == NULL) { flush_data_buff return; } } else if (c == '&') { # define end_ref { \ if (c == '\n') increment_line_number; \ else if (c != ';') curr--; \ } c = *curr++; if (c == '#') { c = *curr++; if (c == 'x' || c == 'X') { wchar n = 0; while (is_hexit(c = *curr++)) n = n*16 + kind[c]; end_ref data_character(n); } else if (c >= '0' && c <= '9') { wchar n = 0; do { n = n*10 + (c - '0'); c = *curr++; } while (c >= '0' && c <= '9'); end_ref data_character(n); } else { /* &#? is not a character reference */ say_where(); fprintf(stderr, "malformed character reference &#%c\n", c); curr -= 2; data_character('&'); } } else if (is_alpha(c)) { struct Entity *y; wchar * const old_d = d; d++; /* make room for \0 if we need to flush */ do { data_character(c); } while (is_alnum(c = *curr++)); end_ref *d = 0; d = old_d; y = find_entity(old_d + 1); if (y == NULL) { flush_data_buff h->entity(old_d + 1); } else if (y->is_cdata) { wchar *s = y->value; while ((c = *s++) != '\0') data_character(c); } else { curr = push_entity(y, curr); } } else { /* &? is plain text and is legal */ curr -= 1; data_character('&'); } # undef end_ref } else if (c != '<') { if (c == '\n') increment_line_number; data_character(c); } else if ((c = *curr++) == '?') { flush_data_buff for (;;) { c = *curr++; if (c == '?' && *curr == '>') { curr++; break; } if (c == '\n') increment_line_number; data_character(c); } *d = 0; for (d = b; *d > ' '; d++) {} if (*d == ' ') *d++ = '\0'; else d = (wchar *)0; h->pi(b, d); d = b; } else if (c == '!') { c = *curr++; if (c == '[') { /* SGML has cdata != 0) flush_data_buff curr += sizeof "CDATA[" - 1; for (;;) { c = *curr++; if (c == ']' && curr[0] == ']' && curr[1] == '>') break; data_character(c); } if (h->cdata != 0) { *d = 0; h->cdata(b); d = b; } curr += 2; } else if (c == '-') { if (h->comment != 0) { do { if (curr[0] != '-') quit("comment(b); d = b; curr++; c = *curr++; deblank } while (c == '-'); } else { do { if (curr[0] != '-') quit("') quit("