/* File : terms.c Author : Richard A. O'Keefe. Updated: 2010 Purpose: Terms and Unification in C */ #include #include #include #include #include /* A term is - a VARIABLE + which may be UNBOUND + or BOUND to some term - or a NON-VARIABLE, which consists of a FUNCTION SYMBOL and a sequence of zero or more ARGUMENTS, which are terms. The number of arguments is called the ARITY. The combination of a function symbol and an arity, commonly written f/n, is called a FUNCTOR. A function symbol is an uninterpreted atom. Programming languages based on first order logic commonly admit numbers as a kind of non-variable term, in which case we distinguish between numbers and callable terms. An ATOMIC term is either a number or a function symbol with no arguments; a COMPOUND term is a term with 1 or more arguments. Issues: representing terms classifying terms dereferencing terms (returning something that is not a bound variable) testing whether an unbound variable occurs in a term reading terms writing terms testing whether two terms are identical binding (and trailing) a variable unifying two terms. Syntax: term : variable | atom | atom '(' term_sequence ')' | list ; term_sequence : term [ ',' term_sequence ] ; list : '[' [term_sequence ['|' term] ']' ; variable : /[_A-Z][A-Za-z0-9_]* / ; atom : /[a-z][A-Za-z0-9_]* / | /'([^']|'')*'/ ; Lists are just a handy abbreviation. [] is the same as '[]'. [t1,t2,...,tn] is the same as [t1,t2,...,tn|[]] [t1|tt] is the same as '.'(t1,tt) [t1,t2,...,tn|tt] is the same as '.'(t1,[t2,...,tn|tt]) */ /* --------------------------------------------------------------------- The string hash table holds a unique copy of each string. */ typedef struct Unique_String *ustr; typedef struct Term_Info *term; struct Unique_String { ustr next; term variable; /* only used while reading */ term atom; /* also used when reading */ size_t quotes; /* how many extra ' does it need */ size_t size; char name[4]; }; #define SEGBITS 9 #define SEGSIZE (1 << SEGBITS) typedef ustr segment[SEGSIZE]; #define MAXLOAD 4 #define DIRBITS 9 #define DIRSIZE (1 << DIRBITS) #define INIT_SEGS 8 #if INIT_SEGS > DIRSIZE #error oops #endif struct HASHTABLE { unsigned long p; /* next bucket to split */ unsigned long maxp; /* upper bound on p during expansion */ unsigned long max_load; /* maximum load factor (in keys): */ /* n_segs*SEGSIZE*MAXLOAD */ unsigned long n_keys; /* number of keys */ unsigned long n_segs; /* number of segments */ segment *directory[DIRSIZE]; }; static struct HASHTABLE string_table; static void *emalloc(size_t size) { void *result = malloc(size); if (result == 0) { perror("malloc"); exit(EXIT_FAILURE); } return result; } void init_strtab(void) { struct HASHTABLE * const h = &string_table; int i, j; h->p = 0; h->n_segs = INIT_SEGS; h->maxp = h->n_segs << SEGBITS; h->max_load = MAXLOAD * h->maxp; h->n_keys = 0; for (i = 0; i < (int)h->n_segs; i++) { segment * const s = emalloc(sizeof *s); for (j = 0; j < SEGSIZE; j++) (*s)[j] = 0; h->directory[i] = s; } for (i = i; i < DIRSIZE; i++) h->directory[i] = 0; } static unsigned long hash_string(char const *str, size_t len) { unsigned char const *p = (unsigned char const *)str; unsigned long h; size_t L; h = 0; for (L = 3; L < len; L += 4, p += 4) h = h * 31 + (p[0] + (p[1] << 3) + (p[2] << 6) + (p[3] << 9)); switch (L - len) { case 0: h += p[2] << 6; /*FALLTHROUGH*/ case 1: h += p[1] << 3; /*FALLTHROUGH*/ case 2: h += p[0]; /*FALLTHROUGH*/ default:; } return h; } static size_t count_quotes(char const *str, size_t len) { size_t i, q; if (len != 0 && islower(str[0])) { i = 1; while (i != len && (isalnum(str[i]) || str[i] == '_')) i++; if (i == len) return 0; } q = 2; for (i = 0; i != len; i++) if (str[i] == '\'') q++; return q; } ustr find_mem(char const *str, size_t len) { struct HASHTABLE * const h = &string_table; size_t const pad = ((0u-len) & ((sizeof (unsigned long)) - 1)) + 1; unsigned long const u0 = hash_string(str, len); { unsigned long const u1 = u0 & (h->maxp - 1); unsigned long const u = u1 >= h->p ? u1 : u0 & ((h->maxp << 1) - 1); ustr * const b = &(*h->directory[u>>SEGBITS])[u & (SEGSIZE - 1)]; ustr *p; /* previous */ ustr c; /* current */ for (p = b; (c = *p) != 0; p = &c->next) { if (c->size == len && 0 == memcmp(c->name, str, len)) { *p = c->next; c->next = *b; *b = c; return c; } } c = emalloc(offsetof(struct Unique_String, name) + len + pad); c->quotes = count_quotes(str, len); c->size = len; (void)memcpy(c->name, str, len); c->name[len] = '\0'; c->next = *b; *b = c; if (++h->n_keys > h->max_load) { unsigned long const oldp = h->p; unsigned long const newp = oldp + h->maxp; if (newp < DIRSIZE * (unsigned long)SEGSIZE) { segment * const olds = h->directory[h->p >> SEGBITS]; segment * news; if ((newp & (SEGSIZE-1)) == 0) { int j; news = emalloc(sizeof *news); for (j = 0; j < SEGSIZE; j++) (*news)[j] = 0; h->n_segs++; h->directory[newp >> SEGBITS] = news; } else { news = h->directory[newp >> SEGBITS]; } h->max_load += MAXLOAD; { unsigned long const mask = (h->maxp << 1) - 1; ustr *prev = &(*olds)[oldp & (SEGSIZE-1)]; ustr *tail = &(*news)[newp & (SEGSIZE-1)]; ustr curr; while ((curr = *prev) != 0) { unsigned long const slot = mask & hash_string(curr->name, curr->size); if (slot == newp) { *tail = curr; tail = &curr->next; *prev = curr->next; } else { prev = &curr->next; } } *tail = 0; } if (++h->p == h->maxp) { h->maxp <<= 1; h->p = 0; } } } return c; } } /*---------------------------------------------------------------------- We'll represent a term by a pointer to a record containing - the arity, using -1 for an unbound variable or -2 for a bound one - if the term is a variable, a pointer for its value - if the term is not a variable, a pointer to a unique string for its function symbol, and 0 or more pointers for its arguments. We're not going to bother using hash consing for these terms. Function symbols do not change at run time, but terms do. */ struct Term_Info { int arity; union { term binding; ustr name; } u; term arg[]; }; #define BOUND_VARIABLE (-2) #define UNBOUND_VARIABLE (-1) term make_anonymous_variable(void) { term r = emalloc(offsetof(struct Term_Info, arg[0])); r->arity = UNBOUND_VARIABLE; r->u.binding = r; return r; } term make_variable(ustr name) { term r = name->variable; if (r == 0) { r = emalloc(offsetof(struct Term_Info, arg[0])); r->arity = UNBOUND_VARIABLE; r->u.binding = r; name->variable = r; } return r; } /* just for efficiency, we use hash consing for atoms */ term make_atom(ustr name) { term r = name->atom; if (r == 0) { r = emalloc(offsetof(struct Term_Info, arg[0])); r->arity = 0; r->u.name = name; name->atom = r; } return r; } term make_term(ustr name, int arity, term const arg[]) { term r = emalloc(offsetof(struct Term_Info, arg[arity])); int i; r->arity = arity, r->u.name = name; for (i = 0; i < arity; i++) r->arg[i] = arg[i]; return r; } /*---------------------------------------------------------------------- Parsing */ static int more_input(void) { int c; do { c = getchar(); if (c == '%') { do c = getchar(); while (c >= 0 && c != '\n'); } if (c < 0) return 0; } while (c <= ' '); ungetc(c, stdin); return 1; } static int next_non_blank(void) { int c; do { c = getchar(); if (c == '%') { do c = getchar(); while (c >= 0 && c != '\n'); } if (c < 0) { fprintf(stderr, "Unexpected end of file\n"); exit(EXIT_FAILURE); } } while (c <= ' '); return c; } static void name_buffer_overflow(void) { fprintf(stderr, "Variable name or atom too long.\n"); exit(EXIT_FAILURE); } static void term_buffer_overflow(void) { fprintf(stderr, "Term or list too long.\n"); exit(EXIT_FAILURE); } static term nil_atom = 0; static ustr dot_name = 0; term read_term(void) { int c; char name[1024]; char *p, *e; term args[100]; term *t, *x; c = next_non_blank(); p = name; e = name + sizeof name; if (isupper(c) || c == '_') { do { if (p == e) name_buffer_overflow(); *p++ = c; c = getchar(); } while (isalnum(c) || c == '_'); ungetc(c, stdin); if (p == name+1 && name[0] == '_') { return make_anonymous_variable(); } else { return make_variable(find_mem(name, p-name)); } } if (c == '[') { if (nil_atom == 0) nil_atom = make_atom(find_mem("[]", 2)); c = next_non_blank(); if (c == ']') { return nil_atom; } else { term r, n; t = args; x = args + sizeof args; ungetc(c, stdin); do { if (t == x) term_buffer_overflow(); *t++ = read_term(); c = next_non_blank(); } while (c == ','); if (c == '|') { r = read_term(); c = next_non_blank(); } else { r = nil_atom; } if (c != ']') { fprintf(stderr, "Missing ]\n"); exit(EXIT_FAILURE); } if (dot_name == 0) dot_name = find_mem(".", 1); while (t != args) { n = emalloc(offsetof(struct Term_Info, arg[2])); n->arity = 2, n->u.name = dot_name, n->arg[0] = *--t, n->arg[1] = r; r = n; } return r; } } if (islower(c)) { do { if (p == e) name_buffer_overflow(); *p++ = c; c = getchar(); } while (isalnum(c) || c == '_'); } else if (c == '\'') { for (;;) { c = getchar(); if (c < 0) { fprintf(stderr, "Unexpected EOF\n"); exit(EXIT_FAILURE); } if (c == '\'') { c = getchar(); if (c != '\'') break; } if (p == e) name_buffer_overflow(); *p++ = c; } } if (c != '(') { ungetc(c, stdin); return make_atom(find_mem(name, p-name)); } else { t = args; x = args + sizeof args; do { if (t == x) term_buffer_overflow(); *t++ = read_term(); c = next_non_blank(); } while (c == ','); if (c != ')') { fprintf(stderr, "Missing )\n"); exit(EXIT_FAILURE); } return make_term(find_mem(name, p-name), t-args, args); } fprintf(stderr, "A term cannot begin with '%c'\n", c); exit(EXIT_FAILURE); } /*---------------------------------------------------------------------- Classifying and decomposing terms */ #define is_bound_variable(t) ((t)->arity == BOUND_VARIABLE) #define is_unbound_variable(t) ((t)->arity == UNBOUND_VARIABLE) #define is_variable(t) ((t)->arity < 0) #define is_not_variable(t) ((t)->arity >= 0) #define is_atom(t) ((t)->arity == 0) #define is_compound(t) ((t)->arity > 0) #define is_dotted_pair(t) ((t)->arity == 2 && (t)->u.name == dot_name) #define variable_binding(t) ((t)->u.binding) term dereference(term t) { while (is_bound_variable(t)) t = variable_binding(t); return t; } term argument(term t, int i) { t = dereference(t); if (i < 0 || i >= t->arity) { fprintf(stderr, "argument index out of range\n"); abort(); } return t->arg[i]; } /*---------------------------------------------------------------------- Printing */ void print_term(term t) { t = dereference(t); if (is_unbound_variable(t)) { printf("V%p", t); } else { char *p = t->u.name->name; size_t n = t->u.name->size; size_t i; if (t->u.name->quotes == 0) { for (i = 0; i != n; i++) putchar(p[i]); } else { putchar('\''); for (i = 0; i != n; i++) { if (p[i] == '\'') putchar('\''); putchar(p[i]); } putchar('\''); } if (is_compound(t)) { int a; for (a = 0; a < t->arity; a++) { printf(a == 0 ? "(" : ", "); print_term(t->arg[a]); } printf(")"); } } } #if 0 int main(void) { init_strtab(); while (more_input()) { print_term(read_term()); printf("\n"); } } #endif /*---------------------------------------------------------------------- Testing whether two terms are the same Two terms are the same if, after replacing bound variables by their values, either - both are the same unbound variable or - both have the same arity and function symbol, and corresponding arguments are the same. */ int same(term x, term y) { int i; x = dereference(x); y = dereference(y); if (is_unbound_variable(x)) return x == y; if (is_unbound_variable(y)) return 0; if (x->arity != y->arity) return 0; if (x->u.name != y->u.name) return 0; for (i = 0; i < x->arity; i++) { if (!same(x->arg[i], y->arg[i])) return 0; } return 1; } /*---------------------------------------------------------------------- Checking whether a term contains a variable. We don't want X = f(X) to succeed making X = f(f(f(f(f(f.... This check is called the "occurs check". For efficiency, most Prolog systems omit it. Alain Colmerauer showed that this is sort of legitimate as long as you admit to using a non-standard version of equality. */ int contains(term t, term v) { t = dereference(t); if (is_unbound_variable(t)) { return t == v; } else { int i; for (i = 0; i < t->arity; i++) { if (contains(t->arg[i], v)) return 1; } return 0; } } /*---------------------------------------------------------------------- In the unification process, we try to make two terms the same by binding variables to values. This is a side effect! If the match fails, we want to undo it. Indeed, in the context of a full system, we might be backtracking over alternative proofs, and would need to undo successful matches. So we maintain a "trail". In general, a trail is a history of inverse actions. Before performing a side effect E, you push -E on the trail, where -E is the action that will undo E. In our case, the only action is "bind a variable", so if we know what the variable is, we know everything we need. With hindsight, I would have done better to thread the trail through the variable terms themselves. I was too heavily influenced by Prolog, which uses some cleverness to avoid trailing bindings in the first place whenever they are sure to disappear before we need to undo them. */ term *trail_base = 0; int trail_top = 0; int trail_limit = 0; static void bind(term v, term t) { if (trail_top == trail_limit) { int new_limit = trail_top == 0 ? 1000 : trail_limit*2; term *new_trail = emalloc(new_limit * sizeof *trail_base); memcpy(new_trail, trail_base, trail_top * sizeof *trail_base); free(trail_base); trail_base = new_trail; trail_limit = new_limit; } trail_base[trail_top++] = v; v->u.binding = t; v->arity = BOUND_VARIABLE; } static void unwind_to(int old_top) { while (trail_top != old_top) { term v = trail_base[--trail_top]; v->u.binding = v; v->arity = UNBOUND_VARIABLE; } } /*---------------------------------------------------------------------- Copying a term replaces all the variables *consistently* by new ones. */ static term copy_worker(term t) { t = dereference(t); if (is_unbound_variable(t)) { if (variable_binding(t) == t) { term v = make_anonymous_variable(); bind(t, v); return v; } else { return t; } } else if (t->arity == 0) { return t; } else { int changed = 0; term arg_copy[t->arity]; int i; for (i = 0; i < t->arity; i++) { arg_copy[i] = copy_worker(t->arg[i]); changed |= arg_copy[i] != t->arg[i]; } if (changed) { return make_term(t->u.name, t->arity, arg_copy); } else { return t; } } } term copy_term(term t) { int old_trail = trail_top; term r = copy_worker(t); unwind_to(old_trail); return r; } /*---------------------------------------------------------------------- Now for the big one: U N I F I C A T I O N ! After all that preparation, it's simple. To unify two terms x and y: dereference them. if x is an unbound variable if y is x do nothing and succeed. if y is not x and y contains x, fail. otherwise bind x to y and succeed. if y is an unbound variable the same only the other way around. if x and y have different arities or function symbols, fail. otherwise, unify corresponding arguments, and fail if any of the recursive calls fails. */ static int unify_worker(term x, term y) { int i; x = dereference(x); y = dereference(y); if (is_unbound_variable(x)) { if (y == x) return 1; if (contains(y, x)) return 0; bind(x, y); } else if (is_unbound_variable(y)) { if (x == y) return 1; if (contains(x, y)) return 0; bind(y, x); } else { if (x->arity != y->arity) return 0; if (x->u.name != y->u.name) return 0; for (i = 0; i < x->arity; i++) { if (!unify_worker(x->arg[i], y->arg[i])) return 0; } } return 1; } int unify(term x, term y) { int old_trail = trail_top; if (unify_worker(x, y)) return 1; unwind_to(old_trail); return 0; } /*---------------------------------------------------------------------- Testing. */ int main(void) { term x, y; int old_trail; init_strtab(); while (more_input()) { x = read_term(); if (next_non_blank() != '=') { fprintf(stderr, "Missing =\n"); exit(EXIT_FAILURE); } y = read_term(); if (next_non_blank() != '.') { fprintf(stderr, "Missing .\n"); exit(EXIT_FAILURE); } old_trail = trail_top; if (unify_worker(x, y)) { printf("YES: "); print_term(x); printf(".\n\n"); } else { printf("NO\n\n"); } unwind_to(old_trail); } return 0; }