/* demo.c -- Implementations of multikey quicksort and ternary search trees Usage demo Run basic timings on /usr/dict/words demo Run basic timings on demo trysearch Interactive pm and nn search on demo nncost Run near neigbhor expers on demo pmcost Interactive partial match expers on SEE ternary.pdf in the same directory. */ #include #include #include #include /* SAFE MALLOC */ static void *emalloc(size_t n) { void *r = malloc(n); if (r == 0) abort(); return r; } /* MULTIKEY QUICKSORT */ #ifndef min #define min(a, b) ((a)<=(b) ? (a) : (b)) #endif #define swap(a, b) { char *t = x[a]; x[a] = x[b]; x[b] = t; } #define i2c(i) x[i][depth] void vecswap(int i, int j, int n, char *x[]) { while (n-- > 0) { swap(i, j); i++; j++; } } void ssort1(char *x[], int n, int depth) { int a, b, c, d, r, v; if (n <= 1) return; a = rand() % n; swap(0, a); v = i2c(0); a = b = 1; c = d = n - 1; for (;;) { while (b <= c && (r = i2c(b) - v) <= 0) { if (r == 0) { swap(a, b); a++; } b++; } while (b <= c && (r = i2c(c) - v) >= 0) { if (r == 0) { swap(c, d); d--; } c--; } if (b > c) break; swap(b, c); b++; c--; } r = min(a, b - a); vecswap(0, b - r, r, x); r = min(d - c, n - d - 1); vecswap(b, n - r, r, x); r = b - a; ssort1(x, r, depth); if (i2c(r) != 0) ssort1(x + r, a + n - d - 1, depth + 1); r = d - c; ssort1(x + n - r, r, depth); } void ssort1main(char *x[], int n) { ssort1(x, n, 0); } /* ssort2 -- Faster Version of Multikey Quicksort */ void vecswap2(char **a, char **b, int n) { while (n-- > 0) { char *t = *a; *a++ = *b; *b++ = t; } } #define swap2(a, b) { t = *(a); *(a) = *(b); *(b) = t; } #define ptr2char(i) (*(*(i) + depth)) char **med3func(char **a, char **b, char **c, int depth) { int va, vb, vc; if ((va = ptr2char(a)) == (vb = ptr2char(b))) return a; if ((vc = ptr2char(c)) == va || vc == vb) return c; return va < vb ? (vb < vc ? b : (va < vc ? c : a)) : (vb > vc ? b : (va < vc ? a : c)); } #define med3(a, b, c) med3func(a, b, c, depth) void inssort(char **a, int n, int d) { char **pi, **pj, *s, *t; for (pi = a + 1; --n > 0; pi++) { for (pj = pi; pj > a; pj--) { /* Inline strcmp: break if *(pj-1) <= *pj */ for ( s = *(pj - 1) + d, t = *pj + d ; *s == *t && *s != 0 ; s++, t++ ) { } if (*s <= *t) break; swap2(pj, pj - 1); } } } void ssort2(char **a, int n, int depth) { int d, r, partval; char **pa, **pb, **pc, **pd, **pl, **pm, **pn, *t; if (n < 10) { inssort(a, n, depth); return; } pl = a; pm = a + (n / 2); pn = a + (n - 1); if (n > 30) { /* On big arrays, pseudomedian of 9 */ d = (n / 8); pl = med3(pl, pl + d, pl + 2 * d); pm = med3(pm - d, pm, pm + d); pn = med3(pn - 2 * d, pn - d, pn); } pm = med3(pl, pm, pn); swap2(a, pm); partval = ptr2char(a); pa = pb = a + 1; pc = pd = a + n - 1; for (;;) { while (pb <= pc && (r = ptr2char(pb) - partval) <= 0) { if (r == 0) { swap2(pa, pb); pa++; } pb++; } while (pb <= pc && (r = ptr2char(pc) - partval) >= 0) { if (r == 0) { swap2(pc, pd); pd--; } pc--; } if (pb > pc) break; swap2(pb, pc); pb++; pc--; } pn = a + n; r = min(pa - a, pb - pa); vecswap2(a, pb - r, r); r = min(pd - pc, pn - pd - 1); vecswap2(pb, pn - r, r); if ((r = pb - pa) > 1) ssort2(a, r, depth); if (ptr2char(a + r) != 0) ssort2(a + r, (int)(pa - a + pn - pd - 1), depth + 1); if ((r = pd - pc) > 1) ssort2(a + n - r, r, depth); } void ssort2main(char **a, int n) { ssort2(a, n, 0); } /* TERNARY SEARCH TREE ALGS */ typedef struct tnode *Tptr; typedef struct tnode { char splitchar; Tptr lokid, eqkid, hikid; } Tnode; Tptr root; /* Insert 1 -- Simple Insertion Algorithm */ Tptr insert1(Tptr p, char *s) { if (p == 0) { p = (Tptr) emalloc(sizeof(Tnode)); p->splitchar = *s; p->lokid = p->eqkid = p->hikid = 0; } if (*s < p->splitchar) { p->lokid = insert1(p->lokid, s); } else if (*s == p->splitchar) { if (*s != 0) p->eqkid = insert1(p->eqkid, ++s); } else { p->hikid = insert1(p->hikid, s); } return p; } void cleanup1(Tptr p) { if (p) { cleanup1(p->lokid); cleanup1(p->eqkid); cleanup1(p->hikid); free(p); } } /* Insert 2 -- Faster version of Insert */ #define BUFSIZE 1000 Tptr buf; int bufn, freen; void *freearr[10000]; int storestring = 0; void insert2(char *s) { int d; char *instr = s; Tptr pp, *p; p = &root; while ((pp = *p) != 0) { if ((d = *s - pp->splitchar) == 0) { if (*s++ == 0) return; p = &(pp->eqkid); } else if (d < 0) { p = &(pp->lokid); } else { p = &(pp->hikid); } } for (;;) { /* *p = (Tptr) emalloc(sizeof(Tnode)); */ if (bufn-- == 0) { buf = (Tptr) emalloc(BUFSIZE * sizeof(Tnode)); freearr[freen++] = (void *) buf; bufn = BUFSIZE - 1; } *p = buf++; pp = *p; pp->splitchar = *s; pp->lokid = pp->eqkid = pp->hikid = 0; if (*s++ == 0) { if (storestring) pp->eqkid = (Tptr) instr; return; } p = &(pp->eqkid); } } void cleanup2(void) { int i; for (i = 0; i < freen; i++) free(freearr[i]); } /* Search Algorithms */ int search1(char *s) { Tptr p; p = root; while (p) { if (*s < p->splitchar) { p = p->lokid; } else if (*s == p->splitchar) { if (*s++ == 0) return 1; p = p->eqkid; } else { p = p->hikid; } } return 0; } int search2(char *s) { int d, sc; Tptr p; sc = *s; p = root; while (p) { if ((d = sc - p->splitchar) == 0) { if (sc == 0) return 1; sc = *++s; p = p->eqkid; } else if (d < 0) { p = p->lokid; } else { p = p->hikid; } } return 0; } /* Advanced searching: Partial match, near words */ int nodecnt; char *srcharr[100000]; int srchtop; void pmsearch(Tptr p, char *s) { if (!p) return; nodecnt++; if (*s == '.' || *s < p->splitchar) pmsearch(p->lokid, s); if (*s == '.' || *s == p->splitchar) if (p->splitchar && *s) pmsearch(p->eqkid, s + 1); if (*s == 0 && p->splitchar == 0) srcharr[srchtop++] = (char *) p->eqkid; if (*s == '.' || *s > p->splitchar) pmsearch(p->hikid, s); } void nearsearch(Tptr p, char *s, int d) { if (!p || d < 0) return; nodecnt++; if (d > 0 || *s < p->splitchar) nearsearch(p->lokid, s, d); if (p->splitchar == 0) { if ((int) strlen(s) <= d) srcharr[srchtop++] = (char *) p->eqkid; } else { nearsearch(p->eqkid, *s ? s + 1 : s, (*s == p->splitchar) ? d : d - 1); } if (d > 0 || *s > p->splitchar) nearsearch(p->hikid, s, d); } /* OTHER SET ALGORITHMS AND DATA STRUCTURES */ /* Private scmp to avoid tuned library function */ int scmp(char *s, char *t) { for (; *s == *t; s++, t++) if (*s == 0) return 0; return *s - *t; } /* Binary search */ int sbsearch(char *key, char **a, int n) { int m; char *s, *t; while (n >= 1) { m = n / 2; for (s = key, t = a[m]; *s == *t; s++, t++) if (*s == 0) return 1; if (*s < *t) { n = m; } else { a = a + m + 1; n = n - m - 1; } } return 0; } /* Hashing */ typedef struct hnode *Hptr; typedef struct hnode { char *str; Hptr next; } Hnode; Hptr *hashtab; int tabsize; int hashfunc(char *s) { unsigned n = 0; for (; *s; s++) n = 31 * n + *s; return n % tabsize; } void hbuild(char **a, int n) { int i, j; Hptr p; tabsize = n; hashtab = (Hptr *) emalloc(tabsize * sizeof(Hptr)); for (i = 0; i < tabsize; i++) hashtab[i] = 0; for (i = 0; i < tabsize; i++) { j = hashfunc(a[i]); p = (Hptr) emalloc(sizeof(Hnode)); p->str = a[i]; p->next = hashtab[j]; hashtab[j] = p; } } int hsearch(char *s) { Hptr p; for (p = hashtab[hashfunc(s)]; p; p = p->next) if (scmp(s, p->str) == 0) return 1; return 0; } int hsearch2(char *ins) { Hptr p; char *s, *t; for (p = hashtab[hashfunc(ins)]; p; p = p->next) { /* if (scmp(ins, p->str) == 0) */ /* return 1; */ for (s = ins, t = p->str; *s == *t; s++, t++) if (*s == 0) return 1; } return 0; } int hsearch3(char *s) { Hptr p; for (p = hashtab[hashfunc(s)]; p; p = p->next) if (strcmp(s, p->str) == 0) return 1; return 0; } /* Radix Sort -- McIlroy, Bostic and McIlroy */ #define SIZE 510 #define THRESHOLD 16 typedef char *string; typedef struct { string *sa; int sn, si; } rstack_t; #define rswap(p, q, r) r = p, p = q, q = r void simplesort(string *a, int n, int b) { /* insertion sort */ string *ak, *ai, s, t; for (ak = a + 1; --n >= 1; ak++) { for (ai = ak; ai > a; ai--) { for (s = ai[0] + b, t = ai[-1] + b; *s; s++, t++) if (*s != *t) break; if (*s >= *t) break; rswap(ai[0], ai[-1], s); } } } #define push(a, n, i) sp->sa = a, sp->sn = n, (sp++)->si = i #define pop(a, n, i) a = (--sp)->sa, n = sp->sn, i = sp->si #define stackempty() (sp <= stack) void rsorta(string * a, int n, int b) { rstack_t stack[SIZE], stmp, *oldsp, *bigsp, *sp = stack; string *pile[256], *ak, *an, r, t; static int count[256], cmin, nc; int c, *cp; push(a, n, b); while (!stackempty()) { pop(a, n, b); if (n < THRESHOLD) { /* divert */ simplesort(a, n, b); continue; } an = a + n; if (nc == 0) { /* nonrecursive? */ cmin = 255; /* tally */ for (ak = a; ak < an;) { c = (*ak++)[b]; if (++count[c] == 1 && c) { if (c < cmin) cmin = c; nc++; } } if (sp + nc > stack + SIZE) { /* stack overflow */ rsorta(a, n, b); continue; } } oldsp = bigsp = sp, c = 2; /* logarithmic stack */ pile[0] = ak = a + count[0]; /* find places */ for (cp = count + cmin; nc > 0; cp++, nc--) { while (*cp == 0) cp++; if (*cp > 1) { if (*cp > c) c = *cp, bigsp = sp; push(ak, *cp, b + 1); } pile[cp - count] = ak += *cp; } rswap(*oldsp, *bigsp, stmp); /* permute home */ for (ak = a; ak < an; ak += count[c], count[c] = 0) { r = *ak; while (--pile[c = r[b]] > ak) rswap(*pile[c], r, t); *ak = r; } /* here nc = count[...] = 0 */ } } void rsort(string * a, int n) { rsorta(a, n, 0); } /* TIMING */ /* Sort support functions */ void scramble(char *x[], int n) { int i; for (; n > 0; n--) { i = rand() % (n + 1); swap(n, i); } } int qscomp(const void *p, const void *q) { return strcmp(*(char **) p, *(char **) q); } #define DOQSORT qsort((void *) a, (size_t) n, sizeof(char *), qscomp) /* Insertion support functions */ int instype; void rinsall(char **a, int n) { int m; if (n < 1) return; m = n / 2; switch (instype) { case 1: root = insert1(root, a[m]); break; case 2: insert2(a[m]); break; case 9: break; } rinsall(a, m); rinsall(a + m + 1, n - m - 1); } void insall(char **a, int n) { switch (instype) { case 1: root = 0; rinsall(a, n); break; case 2: root = 0; bufn = freen = 0; rinsall(a, n); break; case 9: rinsall(a, n); break; } } /* Search support functions */ int searchtype; void searchall(char **a, int n) { int i, j; char s[100]; for (i = 0; i < n; i++) { strcpy(s, a[i]); /* s[0]++; // Uncomment for unsuccessful searches */ switch (searchtype) { case 1: j = search1(s); break; case 2: j = search2(s); break; case 3: srchtop = 0; pmsearch(root, s); j = srchtop; break; case 4: srchtop = 0; nearsearch(root, s, 0); j = srchtop; break; case 5: j = sbsearch(s, a, n); break; case 6: j = hsearch(s); break; case 7: j = hsearch2(s); break; case 8: j = hsearch3(s); break; case 9: j = 1; break; } if (j != 1) /* Comment for unsuccessful searches */ printf("search bug at %d, val %d\n", i, j); } } /* Main timing */ int n = 0, starttime, rptctr; #define TASK(msg) printf("%s", msg); #define CIN starttime = clock(); #define COUT printf("\t%d", clock()-starttime); #define NL printf("\n") #define REPEAT(s) for (rptctr = 0; rptctr < 5; rptctr++) { s }; NL; char space[1000000], *a[100000]; void timeall(void) { TASK("System Qsort") REPEAT(scramble(a, n); CIN; DOQSORT; COUT;) TASK("Simple MKQSort"); REPEAT(scramble(a, n); CIN; ssort1main(a, n); COUT;) TASK("Faster MKQSort"); REPEAT(scramble(a, n); CIN; ssort2main(a, n); COUT;) TASK("Radix Sort"); REPEAT(scramble(a, n); CIN; rsort(a, n); COUT;) TASK("Null Insert"); instype = 9; REPEAT(CIN; insall(a, n); COUT;) TASK("Simp TST Insert"); instype = 1; REPEAT(CIN; insall(a, n); COUT; cleanup1(root);) TASK("Fast TST Insert"); instype = 2; REPEAT(CIN; insall(a, n); COUT; cleanup2();) TASK("Null Search"); searchtype = 9; REPEAT(CIN; searchall(a, n); COUT;) instype = 2; insall(a, n); /* Build TST for searching */ TASK("Simp TST Search"); searchtype = 1; REPEAT(CIN; searchall(a, n); COUT;) TASK("Fast TST Search"); searchtype = 2; REPEAT(CIN; searchall(a, n); COUT;) TASK("PM TST Search"); searchtype = 3; REPEAT(CIN; searchall(a, n); COUT;) TASK("Near TST Search"); searchtype = 4; REPEAT(CIN; searchall(a, n); COUT;) cleanup2(); /* Remove TST */ TASK("Binary Search"); searchtype = 5; REPEAT(CIN; searchall(a, n); COUT;) TASK("Build Hash"); CIN; hbuild(a, n); /* Build once -- no cleanup */ COUT; NL; TASK("Hash Search"); searchtype = 6; REPEAT(CIN; searchall(a, n); COUT;) TASK("Hash, Inln Cmp"); searchtype = 7; REPEAT(CIN; searchall(a, n); COUT;) TASK("Hash, Sys Cmp"); searchtype = 8; REPEAT(CIN; searchall(a, n); COUT;) /* Hash table is still using space */ } void buildptrtree(void) { TASK("Building TST"); CIN; storestring = 1; /* Sleazy hack: store strings in eqkid */ ssort2main(a, n); instype = 2; insall(a, n); COUT; NL; } void trysearch(void) { char s[100]; int i, d; buildptrtree(); printf("Enter searches: (dist=-1 for pm search)\n"); while (scanf("%d %s", &d, s) != EOF) { srchtop = 0; nodecnt = 0; CIN; if (d < 0) pmsearch(root, s); else nearsearch(root, s, d); printf(" matches=%d nodes=%d clicks=", srchtop, nodecnt); COUT; NL; for (i = 0; i < min(srchtop, 40); i++) printf(" %s", srcharr[i]); NL; } } void nncost(void) { int i, d, mincost, minind, maxcost, maxind; double sum; buildptrtree(); for (d = 0; d <= 4; d++) { CIN; mincost = 999999; maxcost = -1; sum = 0.0; for (i = 0; i < n; i++) { srchtop = 0; nodecnt = 0; nearsearch(root, a[i], d); sum += nodecnt; if (nodecnt <= mincost) { mincost = nodecnt; minind = i; } if (nodecnt >= maxcost) { maxcost = nodecnt; maxind = i; } } printf("Dist %d\t%d %s\t%d %s\t%g", d, mincost, a[minind], maxcost, a[maxind], sum / n); COUT; NL; } } void pmcost(void) { int i, j, l, u, mincost, minind, maxcost, maxind; double sum; char buf[100]; buildptrtree(); printf("Enter l, u pair\n"); while (scanf("%d %d", &l, &u) != EOF) { CIN; mincost = 999999; maxcost = -1; sum = 0.0; for (i = 0; i < n; i++) { strcpy(buf, a[i]); for (j = l; j <= u; j++) buf[j] = '.'; srchtop = 0; nodecnt = 0; pmsearch(root, buf); sum += nodecnt; if (nodecnt <= mincost) { mincost = nodecnt; minind = i; } if (nodecnt >= maxcost) { maxcost = nodecnt; maxind = i; } } printf("Range: %d,%d\t%d %s\t%d %s\t%g", l, u, mincost, a[minind], maxcost, a[maxind], sum / n); COUT; NL; } } int main(int argc, char **argv) { int i, globalstarttime; char *s = space, *t, *fname; FILE *fp; if (argc == 1) /* no args */ fname = "/usr/dict/words"; else /* one arg: file name */ fname = argv[1]; setbuf(stdout, 0); if ((fp = fopen(fname, "r")) == NULL) { fprintf(stderr, " Can't open file\n"); exit(1); } globalstarttime = clock(); TASK("Big Malloc"); CIN; t = (char *) emalloc(8000000 * sizeof(char)); free(t); COUT; NL; TASK("Reading Input"); CIN; a[0] = s; while ((i = getc(fp)) != EOF) { if (i == '\n') { *s++ = 0; a[++n] = s; } else { *s++ = i; } } COUT; NL; if (argc < 3) { /* one arg: file name */ timeall(); } else { if (strcmp(argv[2], "trysearch") == 0) { trysearch(); } else if (strcmp(argv[2], "nncost") == 0) { nncost(); } else if (strcmp(argv[2], "pmcost") == 0) { pmcost(); } else { printf("Unrecognized option\n"); } } i = clock() - globalstarttime; printf("Total clicks\t%d\nTotal secs\t%4.3f\n", i, (double) i / CLOCKS_PER_SEC); return 0; }