/* File : wbxml.c Author : Richard A. O'Keefe Updated: 06/27/01 Purpose: Implement WBXML 1.2 */ #ifndef lint static char SCCSid[] = "@(#)01/06/27 wbxml.c 1.7"; #endif/*lint*/ #include #include #include #include "wbxml.h" /* The IANA codes for known character sets */ #define IANA_ASCII 3 #define IANA_ISO_8859_1 4 #define IANA_UTF_8 106 /* WBXML byte codes */ #define SWITCH_PAGE 0x00 #define END 0x01 #define ENTITY 0x02 #define STR_I 0x03 #define LITERAL 0x04 #define EXT_I_0 0x40 #define EXT_I_1 0x41 #define EXT_I_2 0x42 #define PI 0x43 #define LITERAL_C 0x44 #define EXT_T_0 0x80 #define EXT_T_1 0x81 #define EXT_T_2 0x82 #define STR_T 0x83 #define LITERAL_A 0x84 #define EXT_0 0xC0 #define EXT_1 0xC1 #define EXT_2 0xC2 #define OPAQUE 0xC3 #define LITERAL_AC 0xC4 /* Error codes */ #define WBXML_OK 0 /* no error */ #define WBXML_TRUNCATED 1 /* input was truncated */ #define WBXML_VERSION 2 /* unsupported version number */ #define WBXML_CHARSET 3 /* unsupported character set number */ #define WBXML_PUBID 4 /* unsupported public identifier */ #define WBXML_BADDATA 5 /* bad structure */ #define WBXML_UNIMP 6 /* EXT_? and OPAQUE not implemented */ #define WBXML_TOO_BIG 7 /* too much character data at once */ #define WBXML_NO_MEM 8 /* insufficient heap memory */ static int read_multibyte_word(FILE *stream, unsigned long *result) { unsigned long r; int c; r = 0; do { c = getc(stream); if (c < 0) return 1; r = (r << 7) | ((unsigned)c & 0x7Fu); } while (((unsigned)c & 0x80u) != 0); *result = r; return 0; } #define err(n) {error_code = n; goto report_error; } #define get_byte(s, v) \ if ((c = getc(s)) >= 0) *(v) = (unsigned char)c; else err(WBXML_TRUNCATED) #define get_word(s, v) \ if (read_multibyte_word(s, v)) err(WBXML_TRUNCATED) #define NS (char *)0 static struct WBXML_dictionary *all_dictionaries[] = { #ifdef TESTING &dict1, &dict2, #endif &wapdict, #ifdef HTML &htmldict, #endif (struct WBXML_dictionary *)0 }; #define MAX_DATA (4 * 1024 * 1024) static unsigned char cstack[MAX_DATA]; static int csp = 0; #define deposit(c) \ if (csp == MAX_DATA) goto too_much_data; else cstack[csp++] = c #define MAX_TAGS 1000 struct Tag { struct ROKXML_binding *list; wchar *name; int csp; }; static struct Tag tag_stack[MAX_TAGS]; static int tsp = 0; static void free_bindings(struct ROKXML_binding *list) { struct ROKXML_binding *p, *q; for (p = list; p != 0; p = q) { q = p->next; free(p); } } static struct ROKXML_binding * msort(struct ROKXML_binding *a) { struct ROKXML_binding *p, *q, *t, **e; int c; if ((p = q = a) != 0) { while ((p = p->next) != 0 && (p = p->next) != 0) q = q->next; if ((p = q->next) != 0) { q->next = 0; q = msort(p); p = msort(a); a = 0; e = &a; while (p != 0 && q != 0) { c = strcmp((char const *)p->name, (char const *)q->name); if (c <= 0) { if (c == 0) { q = (t = q)->next; free(t); } p = *(e = &(*e = p)->next); } else { q = *(e = &(*e = q)->next); } } *e = p == 0 ? q : p; } } return a; } int wbxml(FILE *stream, struct ROKXML_handlers const *handlers) { enum { CONT_STATE, /* processing contents of element or document */ ATTR_STATE, /* processing attributes of empty element */ BOTH_STATE, /* processing attributes of non-empty element */ PI_STATE /* processing processing instruction */ } state = CONT_STATE; unsigned char version; unsigned long public_id, public_index; unsigned long charset; unsigned long strtbl_size; unsigned char *strtbl = 0; unsigned char const *pubid; unsigned char code; unsigned long temp; unsigned char *src; struct WBXML_dictionary *dictionary; unsigned char **tags, **atts; struct Tag *t; struct ROKXML_binding *l; unsigned char ch; int c; int error_code; int saved_csp; /* version */ get_byte(stream, &version); if (version != 0x02) err(WBXML_VERSION); /* public identifier (part 1) */ get_word(stream, &public_id); public_index = 0; if (public_id == 0) get_word(stream, &public_index); /* character set */ get_word(stream, &charset); if (charset != IANA_ISO_8859_1 && charset != IANA_UTF_8 && charset != IANA_ASCII ) err(WBXML_CHARSET); if (charset == IANA_ASCII) charset = IANA_UTF_8; /* String table */ get_word(stream, &strtbl_size); if (strtbl_size == 0) { strtbl = (unsigned char *)""; } else { strtbl = malloc(strtbl_size); if (strtbl == 0) err(WBXML_NO_MEM); if (fread(strtbl, 1, strtbl_size, stream) != strtbl_size) err(WBXML_TRUNCATED); } /* public identifier (part 2) */ switch ((unsigned)public_id) { case 0u: if (public_index >= strtbl_size) err(WBXML_BADDATA); pubid = strtbl + public_index; break; case 1u: pubid = 0; break; case 2u: pubid = (unsigned char const *)"-//WAPFORUM/DTD WML 1.0//EN"; break; case 4u: pubid = (unsigned char const *)"-//WAPFORUM/DTD WML 1.1//EN"; break; case 8u: pubid = (unsigned char const *)"-//WAPFORUM/DTD WML 1.2//EN"; break; /* For testing */ case 126u: pubid = (unsigned char const *)"-//WAPFORUM/DTD WBXML TEST 1//EN"; break; case 127u: pubid = (unsigned char const *)"-//WAPFORUM/DTD WBXML TEST 2//EN"; break; /* End testing */ case 132u: pubid = (unsigned char const *)"-//W3C/DTD HTML 3.2"; break; case 133u: pubid = (unsigned char const *)"-//W3C/DTD HTML 4.01 Transitional"; break; case 134u: pubid = (unsigned char const *)"-//W3C/DTD HTML 4.01 Frameset"; break; case 135u: pubid = (unsigned char const *)"-//W3C/DTD HTML 4.01 Strict"; break; default: pubid = (unsigned char *)" "; } if (pubid == (unsigned char *)0) { tags = atts = 0; } else { int i, j; for (i = 0; (dictionary = all_dictionaries[i]) != 0; i++) for (j = 0; dictionary->pubs[j] != 0; j++) if (strcmp((char *)dictionary->pubs[j], (char*)pubid) == 0) goto found; err(WBXML_PUBID); found: tags = dictionary->tags[0]; atts = dictionary->atts[0]; } handlers->start(0, pubid); /* body of document */ saved_csp = csp = 0; while ((c = getc(stream)) >= 0) { code = (unsigned char)c; while (code == SWITCH_PAGE) { get_byte(stream, &code); if (state == CONT_STATE) { tags = dictionary->tags[code]; } else { atts = dictionary->atts[code]; } get_byte(stream, &code); } switch (code) { case END: if (state != CONT_STATE) { /* end of an attribute list */ if (t->list != 0) {deposit(0);} t->list = msort(t->list); } if (tsp == 0) err(WBXML_BADDATA); t = &tag_stack[--tsp]; switch (state) { case CONT_STATE: if (csp != saved_csp) { deposit(0); handlers->text(cstack + saved_csp); } handlers->end(t->name, t->list); break; case ATTR_STATE: handlers->empty(t->name, t->list); free_bindings(t->list); break; case BOTH_STATE: handlers->begin(t->name, t->list); tsp++; break; case PI_STATE: l = t->list; if (t->name != 0 || l == 0) err(WBXML_BADDATA); handlers->pi(l->name, l->value); free_bindings(l); break; } state = CONT_STATE; saved_csp = csp = t->csp; break; case ENTITY: get_word(stream, &temp); if (temp < 128) { deposit(temp); } else if (temp < 32*64) { deposit(temp/64 + 6*32); deposit(temp%64 + 128); } else { deposit(temp/4096 + 14*16); deposit((temp/64)%64 + 128); deposit(temp%64 + 128); } break; case STR_I: while ((c = getc(stream)) > 0) { ch = (unsigned char)c; if (charset == IANA_UTF_8 || ch < 128) { deposit(ch); } else { deposit((ch>>6) + 6*62); deposit((ch&63) + 128); } } if (c < 0) err(WBXML_TRUNCATED); break; case STR_T: get_word(stream, &temp); if (temp >= strtbl_size) err(WBXML_BADDATA); src = strtbl + temp; while ((ch = *src++) != 0) { if (charset == IANA_UTF_8 || ch < 128) { deposit(ch); } else { deposit((ch>>6) + 6*62); deposit((ch&63) + 128); } } break; case EXT_I_0: case EXT_I_1: case EXT_I_2: deposit('$'); deposit('('); while ((c = getc(stream)) > 0) { ch = (unsigned char)c; if (charset == IANA_UTF_8 || ch < 128) { deposit(ch); } else { deposit((ch>>6) + 6*62); deposit((ch&63) + 128); } } goto rest_ext; case EXT_T_0: case EXT_T_1: case EXT_T_2: deposit('$'); deposit('('); get_word(stream, &temp); if (temp >= strtbl_size) err(WBXML_BADDATA); src = strtbl + temp; while ((ch = *src++) != 0) { if (charset == IANA_UTF_8 || ch < 128) { deposit(ch); } else { deposit((ch>>6) + 6*62); deposit((ch&63) + 128); } } rest_ext: { static char *escaping[3] = {":escape", ":unesc", ":noesc"}; src = (unsigned char *)escaping[code & 3]; while ((ch = *src++) != 0) { deposit(ch); } } deposit(')'); break; case EXT_0: case EXT_1: case EXT_2: case OPAQUE: /* not used in WML or HTML */ err(WBXML_UNIMP); /*NOTREACHED*/ break; case PI: if (state != CONT_STATE) err(WBXML_BADDATA); if (csp != saved_csp) { deposit(0); handlers->text(cstack + saved_csp); csp = saved_csp; } state = PI_STATE; if (tsp == MAX_TAGS) goto too_much_data; t = &tag_stack[tsp++]; t->name = 0, t->list = 0, t->csp = saved_csp = csp; break; case LITERAL: case LITERAL_C: case LITERAL_A: case LITERAL_AC: get_word(stream, &temp); if (temp >= strtbl_size) err(WBXML_BADDATA); src = strtbl + temp; goto rest_tag; default: if (state != CONT_STATE) { if (atts == 0 || (src = (unsigned char *)atts[code]) == 0) err(WBXML_BADDATA); if (code < 128) { if (t->list != 0) {deposit(0);} saved_csp = csp; l = malloc(sizeof *l); if (l == 0) err(WBXML_NO_MEM); t = &tag_stack[tsp-1]; l->next = t->list, l->name = src, l->value = cstack + csp; t->list = l; src = code == LITERAL ? (unsigned char *)"" : src + strlen((char const *)src) + 1; } while ((ch = *src++) != 0) { if (charset == IANA_UTF_8 || ch < 128) { deposit(ch); } else { deposit((ch>>6) + 6*62); deposit((ch&63) + 128); } } break; } if (tags == 0 || (src = (unsigned char *)tags[code & 63]) == 0) err(WBXML_BADDATA); rest_tag: if (csp != saved_csp) { deposit(0); handlers->text(cstack + saved_csp); csp = saved_csp; } if ((code & 0xC0) == 0) { handlers->empty(src, 0); } else { if (tsp == MAX_TAGS) goto too_much_data; t = &tag_stack[tsp++]; t->name = src, t->list = 0, t->csp = saved_csp = csp; switch (code & 0xC0) { case 0xC0: state = BOTH_STATE; break; case 0x80: state = ATTR_STATE; break; case 0x40: state = CONT_STATE; handlers->begin(src, 0); break; } } break; } } handlers->finish(); if (strtbl != 0) free(strtbl); return WBXML_OK; too_much_data: error_code = WBXML_TOO_BIG; report_error: if (strtbl != 0) free(strtbl); return error_code; }