ref: 511421003a44633d8907828aef280acc762cc56b
dir: /sys/src/cmd/spell/sprog.c/
/*
 * spell: decide whether words are spelled correctly.
 *
 * Words are read one per line from standard input.  A word is
 * accepted when it is found in the compressed dictionary read by
 * readdict(), either directly or after stripping prefixes
 * (preftab) and suffixes (suftab).  The bit names used throughout
 * (NOUN, ADJ, STOP, MONO, IN, ALL, ...) come from code.h, which is
 * not part of this file.
 */
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <ctype.h>
#include "code.h"

/* fig leaves for possibly signed char quantities */
#define ISUPPER(c)	isupper((c)&0xff)
#define ISLOWER(c)	islower((c)&0xff)
#define	ISALPHA(c)	isalpha((c)&0xff)
#define	ISDIGIT(c)	isdigit((c)&0xff)
#define ISVOWEL(c)	voweltab[(c)&0xff]
#define Tolower(c)	(ISUPPER(c)? (c)-'A'+'a': (c))
#define pair(a,b)	(((a)<<8) | (b))
#define DLEV		2	/* trysuff() advances the derivation level by this much */
#define DSIZ		40	/* derivation levels recorded in deriv[] */

typedef	long	Bits;
#define	Set(h, f)	((long)(h) & (f))

Bits	nop(char*, char*, char*, int, int);
Bits	strip(char*, char*, char*, int, int);
Bits	ize(char*, char*, char*, int, int);
Bits	i_to_y(char*, char*, char*, int, int);
Bits	ily(char*, char*, char*, int, int);
Bits	subst(char*, char*, char*, int, int);
Bits	CCe(char*, char*, char*, int, int);
Bits	tion(char*, char*, char*, int, int);
Bits	an(char*, char*, char*, int, int);
Bits	s(char*, char*, char*, int, int);
Bits	es(char*, char*, char*, int, int);
Bits	bility(char*, char*, char*, int, int);
Bits	y_to_e(char*, char*, char*, int, int);
Bits	VCe(char*, char*, char*, int, int);

Bits	trypref(char*, char*, int, int);
Bits	tryword(char*, char*, int, int);
Bits	trysuff(char*, int, int);
Bits	dict(char*, char*);
void	typeprint(Bits);
void	pcomma(char*);

void	ise(void);
int	ordinal(void);
char*	skipv(char*);
int	inun(char*, Bits);
char*	ztos(char*);
void	readdict(char*);

/*
 * prefix table entry: the prefix text and a flag.
 * the only flag used in this file is IN, which makes
 * trypref() consult inun() before accepting the prefix.
 */
typedef	struct	Ptab	Ptab;
struct	Ptab
{
	char*	s;
	int	flag;
};

/*
 * suffix table entry.
 *	suf		the suffix spelled backwards; trysuff() matches
 *			it against the word read from the end
 *	p1,n1,d1,a1	first rule: routine to call, number of
 *			characters to drop from the end of the word,
 *			and the two derivation messages handed to
 *			the routine as d and a
 *	flag		passed (or'ed with STOP) to the routine as
 *			the set of acceptable dictionary bits
 *	affixable	tested against the caller's flag; the suffix
 *			is refused when no bit is common
 *	p2,n2,d2,a2	optional second rule, tried when the first fails
 */
typedef	struct	Suftab	Suftab;
struct	Suftab
{
	char	*suf;
	Bits	(*p1)(char*, char*, char*, int, int);
	int	n1;
	char	*d1;
	char	*a1;
	int	flag;
	int	affixable;
	Bits	(*p2)(char*, char*, char*, int, int);
	int	n2;
	char	*d2;
	char	*a2;
};

/*
 * one table per final letter; each is ended by an entry
 * with a nil suf.  order matters: trysuff() stops at the
 * first entry whose suffix matches.
 */
Suftab	staba[] =
{
	{"aibohp",subst,1,"-e+ia","",NOUN, NOUN},
	0
};

Suftab	stabc[] =
{
	{"cai",strip,1,"","+c",N_AFFIX, ADJ|NOUN},
	{"citsi",strip,2,"","+ic",N_AFFIX, ADJ | N_AFFIX | NOUN},
	{"citi",ize,1,"-e+ic","",N_AFFIX, ADJ },
	{"cihparg",i_to_y,1,"-y+ic","",NOUN, ADJ|NOUN },
	{"cipocs",ize,1,"-e+ic","",NOUN, ADJ },
	{"cirtem",i_to_y,1,"-y+ic","",NOUN, ADJ },
	{"cigol",i_to_y,1,"-y+ic","",NOUN, ADJ },
	{"cimono",i_to_y,1,"-y+ic","",NOUN, ADJ },
	{"cibohp",subst,1,"-e+ic","",NOUN, ADJ },
	0
};

Suftab	stabd[] =
{
	{"de",strip,1,"","+d",ED,ADJ |COMP,i_to_y,2,"-y+ied","+ed"},
	{"dooh",ily,4,"-y+ihood","+hood",NOUN | ADV, NOUN},
	0
};

Suftab	stabe[] =
{
	/*
	 * V_affix for comment ->commence->commentment??
	 */
	{"ecna",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
	{"ecne",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
	{"elbaif",i_to_y,4,"-y+iable","",V_IRREG,ADJ},
	{"elba",CCe,4,"-e+able","+able",V_AFFIX,ADJ},
	{"evi",subst,0,"-ion+ive","",N_AFFIX | V_AFFIX,NOUN | N_AFFIX| ADJ},
	{"ezi",CCe,3,"-e+ize","+ize",N_AFFIX|ADJ ,V_AFFIX | VERB |ION | COMP},
	{"ekil",strip,4,"","+like",N_AFFIX ,ADJ},
	0
};

Suftab	stabg[] =
{
	{"gniee",strip,3,"","+ing",V_IRREG ,ADJ|NOUN},
	{"gnikam",strip,6,"","+making",NOUN,NOUN},
	{"gnipeek",strip,7,"","+keeping",NOUN,NOUN},
	{"gni",CCe,3,"-e+ing","+ing",V_IRREG ,ADJ|ED|NOUN},
	0
};

Suftab	stabl[] =
{
	{"ladio",strip,2,"","+al",NOUN |ADJ,ADJ},
	{"laci",strip,2,"","+al",NOUN |ADJ,ADJ |NOUN|N_AFFIX},
	{"latnem",strip,2,"","+al",N_AFFIX,ADJ},
	{"lanoi",strip,2,"","+al",N_AFFIX,ADJ|NOUN},
	{"luf",ily,3,"-y+iful","+ful",N_AFFIX,ADJ | NOUN},
	0
};

Suftab	stabm[] =
{
		/* congregational + ism */
	/* message was "ism"; every other added-suffix message starts with '+' */
	{"msi",CCe,3,"-e+ism","+ism",N_AFFIX|ADJ,NOUN},
	{"margo",subst,-1,"-ph+m","",NOUN,NOUN},
	0
};

Suftab	stabn[] =
{
	{"noitacifi",i_to_y,6,"-y+ication","",ION,NOUN | N_AFFIX},
	{"noitazi",ize,4,"-e+ation","",ION,NOUN| N_AFFIX},
	{"noit",tion,3,"-e+ion","+ion",ION,NOUN| N_AFFIX | V_AFFIX |VERB|ACTOR},
	{"naino",an,3,"","+ian",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
	{"namow",strip,5,"","+woman",MAN,PROP_COLLECT|N_AFFIX},
	{"nam",strip,3,"","+man",MAN,PROP_COLLECT | N_AFFIX | VERB},
	{"na",an,1,"","+n",NOUN|PROP_COLLECT,NOUN | N_AFFIX},
	{"nemow",strip,5,"","+women",MAN,PROP_COLLECT},
	/* message was "+man"; the suffix removed here is "men" (cf. "+women" above) */
	{"nem",strip,3,"","+men",MAN,PROP_COLLECT},
	{"nosrep",strip,6,"","+person",MAN,PROP_COLLECT},
	0
};

Suftab	stabp[] =
{
	{"pihs",strip,4,"","+ship",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
	0
};

Suftab	stabr[] =
{
	{"rehparg",subst,1,"-y+er","",ACTOR,NOUN,strip,2,"","+er"},
	{"reyhparg",nop,0,"","",0,NOUN},
	{"reyl",nop,0,"","",0,NOUN},
	{"rekam",strip,5,"","+maker",NOUN,NOUN},
	{"repeek",strip,6,"","+keeper",NOUN,NOUN},
	{"re",strip,1,"","+r",ACTOR,NOUN | N_AFFIX|VERB|ADJ,	i_to_y,2,"-y+ier","+er"},
	{"rota",tion,2,"-e+or","",ION,NOUN| N_AFFIX|_Y},
	{"rotc",tion,2,"","+or",ION,NOUN| N_AFFIX},
	{"rotp",tion,2,"","+or",ION,NOUN| N_AFFIX},
	0
};

Suftab	stabs[] =
{
	{"ssen",ily,4,"-y+iness","+ness",ADJ|ADV,NOUN| N_AFFIX},
	{"ssel",ily,4,"-y+iless","+less",NOUN | PROP_COLLECT,ADJ },
	{"se",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH ,	es,2,"-y+ies","+es"},
	{"s'",s,2,"","+'s",PROP_COLLECT | NOUN,DONT_TOUCH },
	{"s",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH },
	0
};

Suftab	stabt[] =
{
	{"tnem",strip,4,"","+ment",V_AFFIX,NOUN | N_AFFIX | ADJ|VERB},
	{"tse",strip,2,"","+st",EST,DONT_TOUCH,	i_to_y,3,"-y+iest","+est" },
	{"tsigol",i_to_y,2,"-y+ist","",N_AFFIX,NOUN | N_AFFIX},
	{"tsi",CCe,3,"-e+ist","+ist",N_AFFIX|ADJ,NOUN | N_AFFIX|COMP},
	0
};

Suftab	staby[] =
{
	{"ycna",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
	{"ycne",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
	{"ytilib",bility,5,"-le+ility","",ADJ | V_AFFIX,NOUN | N_AFFIX},
	{"ytisuo",nop,0,"","",NOUN},
	{"ytilb",nop,0,"","",0,NOUN},
	{"yti",CCe,3,"-e+ity","+ity",ADJ ,NOUN | N_AFFIX },
	{"ylb",y_to_e,1,"-e+y","",ADJ,ADV},
	{"ylc",nop,0,"","",0},
	{"ylelb",nop,0,"","",0},
	{"ylelp",nop,0,"","",0},
	{"yl",ily,2,"-y+ily","+ly",ADJ,ADV|COMP},
	{"yrtem",subst,0,"-er+ry","",NOUN,NOUN | N_AFFIX},
	{"y",CCe,1,"-e+y","+y",_Y,ADJ|COMP},
	0
};

Suftab	stabz[] =
{
	0
};

/* indexed by (last letter of word) - 'a' */
Suftab*	suftab[] =
{
	staba,
	stabz,
	stabc,
	stabd,
	stabe,
	stabz,
	stabg,
	stabz,
	stabz,
	stabz,
	stabz,
	stabl,
	stabm,
	stabn,
	stabz,
	stabp,
	stabz,
	stabr,
	stabs,
	stabt,
	stabz,
	stabz,
	stabz,
	stabz,
	staby,
	stabz,
};

/*
 * prefix tables, one per initial letter, each ended by a nil s.
 * lookuppref() takes the first entry that matches, so a longer
 * prefix must precede any shorter one it begins with.
 */
Ptab	ptaba[] =
{
	"anti", 0,
	"auto", 0,
	0
};

Ptab	ptabb[] =
{
	"bio", 0,
	0
};

Ptab	ptabc[] =
{
	"counter", 0,
	0
};

Ptab	ptabd[] =
{
	"dis", 0,
	0
};

Ptab	ptabe[] =
{
	"electro", 0,
	0
};

Ptab	ptabf[] =
{
	"femto", 0,
	0
};

Ptab	ptabg[] =
{
	"geo", 0,
	"giga", 0,
	0
};

Ptab	ptabh[] =
{
	"hyper", 0,
	0
};

Ptab	ptabi[] =
{
	"immuno", 0,
	"im", IN,
	"intra", 0,
	"inter", 0,
	"in", IN,
	"ir", IN,
	"iso", 0,
	0
};

Ptab	ptabj[] =
{
	0
};

Ptab	ptabk[] =
{
	"kilo", 0,
	0
};

Ptab	ptabl[] =
{
	0
};

Ptab	ptabm[] =
{
	"magneto", 0,
	"mega", 0,
	"meta", 0,
	"micro", 0,
	"mid", 0,
	"milli", 0,
	"mini", 0,
	"mis", 0,
	"mono", 0,
	"multi", 0,
	0
};

Ptab	ptabn[] =
{
	"nano", 0,
	"neuro", 0,
	"non", 0,
	0
};

Ptab	ptabo[] =
{
	"out", 0,
	"over", 0,
	0
};

Ptab	ptabp[] =
{
	"para", 0,
	"photo", 0,
	"pico", 0,
	"poly", 0,
	"pre", 0,
	"pseudo", 0,
	"psycho", 0,
	0
};

Ptab	ptabq[] =
{
	"quasi", 0,
	0
};

Ptab	ptabr[] =
{
	"radio", 0,
	"re", 0,
	0
};

Ptab	ptabs[] =
{
	"semi", 0,
	"stereo", 0,
	"sub", 0,
	"super", 0,
	0
};

Ptab	ptabt[] =
{
	"tele", 0,
	"tera", 0,
	"thermo", 0,
	0
};

Ptab	ptabu[] =
{
	"ultra", 0,
	"under", 0,	/*must precede un*/
	"un", IN,
	0
};

Ptab	ptabv[] =
{
	0
};

Ptab	ptabw[] =
{
	0
};

Ptab	ptabx[] =
{
	0
};

Ptab	ptaby[] =
{
	0
};

Ptab	ptabz[] =
{
	0
};

/* indexed by (lower-cased first letter) - 'a' */
Ptab*	preftab[] =
{
	ptaba,
	ptabb,
	ptabc,
	ptabd,
	ptabe,
	ptabf,
	ptabg,
	ptabh,
	ptabi,
	ptabj,
	ptabk,
	ptabl,
	ptabm,
	ptabn,
	ptabo,
	ptabp,
	ptabq,
	ptabr,
	ptabs,
	ptabt,
	ptabu,
	ptabv,
	ptabw,
	ptabx,
	ptaby,
	ptabz,
};

/*
 * one step of the derivation being tried: the message to print
 * and its kind.  for prefixes, type holds PREF times the number
 * of prefixes stripped so far (see trypref and tryword).
 */
typedef struct {
	char *mesg;
	enum { NONE, SUFF, PREF} type;
} Deriv;

int	aflag;		/* -a: lines are id:id:word; output carries the id */
int	cflag;		/* -c, -C: print one status character per word */
int	fflag;		/* -f given: dictionary file named explicitly */
int	vflag;		/* -v, -C: collect derivations */
int	xflag;		/* -x: trace dictionary lookups on fd 2 */
int 	nflag;
char	word[500];	/* the word being analysed; edited in place */
char*	original;	/* the word as read from the input */
Deriv	emptyderiv;
Deriv	deriv[DSIZ+3];
char	affix[DSIZ*10];	/* 10 is longest affix message */
int	prefcount;
int 	suffcount;
char*	acmeid;
char	space[300000];	/* must be as large as "words"+"space" in pcode run */
Bits	encode[2048];	/* must be as long as "codes" in pcode run */
int	nencode;
char	voweltab[256];
char*	spacep[128*128+1];	/* pointer to words starting with 'xx' */
Biobuf	bin;
Biobuf	bout;

char*	codefile = "/sys/lib/amspell";
char*	brfile = "/sys/lib/brspell";
char*	Usage = "usage";

/*
 * parse the options, load the dictionary, then judge each
 * input line.  without -c, rejected words are written to bout
 * and (when affix is non-empty) derivations are printed for
 * accepted ones; with -c a single character is printed per word.
 */
void
main(int argc, char *argv[])
{
	char *ep, *cp;
	char *dp;
	int j, i, c;
	int low;
	Bits h;

	Binit(&bin, 0, OREAD);
	Binit(&bout, 1, OWRITE);
	for(i=0; c = "aeiouyAEIOUY"[i]; i++)
		voweltab[c] = 1;
	while(argc > 1) {
		if(argv[1][0] != '-')
			break;
		for(i=1; c = argv[1][i]; i++)
		switch(c) {
		default:
			fprint(2, "usage: spell [-bcCvx] [-f file]\n");
			exits(Usage);

		case 'a':
			aflag++;
			continue;

		case 'b':
			ise();
			if(!fflag)
				codefile = brfile;
			continue;

		case 'C':		/* for "correct" */
			vflag++;
			/* fall through */
		case 'c':		/* for ocr */
			cflag++;
			continue;

		case 'v':
			vflag++;
			continue;

		case 'x':
			xflag++;
			continue;

		case 'f':
			if(argc <= 2) {
				fprint(2, "spell: -f requires another argument\n");
				exits(Usage);
			}
			argv++;
			argc--;
			codefile = argv[1];
			fflag++;
			goto brk;
		}
	brk:
		argv++;
		argc--;
	}
	readdict(codefile);
	if(argc > 1) {
		fprint(2, "usage: spell [-bcCvx] [-f file]\n");
		exits(Usage);
	}
	if(aflag)
		cflag = vflag = 0;

	for(;;) {
		affix[0] = 0;
		original = Brdline(&bin, '\n');
		if(original == 0)
			exits(0);
		original[Blinelen(&bin)-1] = 0;
		low = 0;

		if(aflag) {
			/* skip the two ':'-terminated fields in front of the word */
			acmeid = original;
			while(*original != ':')
				if(*original++ == 0)
					exits(0);
			while(*++original != ':')
				if(*original == 0)
					exits(0);
			*original++ = 0;
		}
		/* copy into word, truncating to fit, and count lower-case letters */
		for(ep=word,dp=original; j = *dp; ep++,dp++) {
			if(ISLOWER(j))
				low++;
			if(ep >= word+sizeof(word)-1)
				break;
			*ep = j;
		}
		*ep = 0;

		if(ISDIGIT(word[0]) && ordinal())
			continue;

		h = 0;
		/* no lower case at all: try as is, then with all but the first letter folded */
		if(!low && !(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH)))
			for(cp=original+1,dp=word+1; dp<ep; dp++,cp++)
				*dp = Tolower(*cp);
		if(!h)
		for(;;) {	/* at most twice */
			if(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH))
				break;
			if(h = trysuff(ep,0,ALL|STOP|DONT_TOUCH))
				break;
			if(!ISUPPER(word[0]))
				break;
			/*
			 * the suffix routines may have scribbled on word;
			 * rebuild it from the original with the first
			 * letter folded.  never copy more than was put
			 * in word above: the line may be longer than word.
			 */
			for(cp=original,dp=word; dp<ep; cp++,dp++)
				*dp = low? *cp: Tolower(*cp);
			*dp = 0;
			word[0] = Tolower(word[0]);
		}

		if(cflag) {
			if(!h || Set(h,STOP))
				print("-");
			else if(!vflag)
				print("+");
			else
				print("%c",'0' + (suffcount>0) +
				   (prefcount>4? 8: 2*prefcount));
		} else if(!h || Set(h,STOP)) {
			if(aflag)
				Bprint(&bout, "%s:%s\n", acmeid, original);
			else
				Bprint(&bout, "%s\n", original);
		} else if(affix[0] != 0 && affix[0] != '.')
			print("%s\t%s\n", affix, original);
	}
	/* not reached */
}

/*	strip exactly one suffix and do
 *	indicated routine(s), which may recursively
 *	strip suffixes
 *
 *	ep points just past the last letter of the word under
 *	consideration, lev is the derivation level and flag the
 *	set of bits the caller will accept.  returns the
 *	dictionary bits of the stem found, or 0.
 */
Bits
trysuff(char* ep, int lev, int flag)
{
	Suftab *t;
	char *cp, *sp;
	Bits h = 0;
	int initchar;

	flag &= ~MONO;
	lev += DLEV;
	if(lev < DSIZ) {
		deriv[lev]  = emptyderiv;
		deriv[lev-1] = emptyderiv;
	}
	if(ep <= word)		/* empty word: there is no last letter */
		return h;
	initchar = ep[-1];
	if(!ISLOWER(initchar))
		return h;
	for(t=suftab[initchar-'a']; sp=t->suf; t++) {
		/* suf is stored reversed; compare from the end of the word */
		cp = ep;
		while(*sp)
			if(cp <= word || *--cp != *sp++)
				goto next;
		/* what remains must contain a vowel */
		for(sp=ep-t->n1; --sp >= word && !ISVOWEL(*sp);)
			;
		if(sp < word)
			continue;
		if(!(t->affixable & flag))
			return 0;
		h = (*t->p1)(ep-t->n1, t->d1, t->a1, lev+1, t->flag|STOP);
		if(!h && t->p2!=0) {
			if(lev < DSIZ) {
				deriv[lev] = emptyderiv;
				deriv[lev+1] = emptyderiv;
			}
			h = (*t->p2)(ep-t->n2, t->d2, t->a2, lev, t->flag|STOP);
		}
		break;
	next:;
	}
	return h;
}

/*
 * rule that always fails.  because trysuff() stops at the
 * first matching table entry, a nop entry keeps the shorter
 * suffixes listed after it from being tried.
 */
Bits
nop(char* ep, char* d, char* a, int lev, int flag)
{
	USED(ep, d, a, lev, flag);
	return 0;
}

/*
 * strip(), unless the junction between stem and suffix is one
 * of the listed vowel pairs, or (when not two vowels) the first
 * suffix letter repeats the last two stem letters.
 */
Bits
cstrip(char* ep, char* d, char* a, int lev, int flag)
{
	int temp = ep[0];

	if(ISVOWEL(temp) && ISVOWEL(ep[-1])) {
		switch(pair(ep[-1],ep[0])) {
		case pair('a', 'a'):
		case pair('a', 'e'):
		case pair('a', 'i'):
		case pair('e', 'a'):
		case pair('e', 'e'):
		case pair('e', 'i'):
		case pair('i', 'i'):
		case pair('o', 'a'):
			return 0;
		}
	} else
	if(temp==ep[-1]&&temp==ep[-2])
		return 0;
	return strip(ep,d,a,lev,flag);
}

/*
 * look for the stem word..ep as it stands (with prefixes
 * stripped if need be); failing that, when the suffix begins
 * with a vowel and the stem ends in a doubled consonant, retry
 * with one of the pair removed (MONO); failing that, strip
 * another suffix.
 */
Bits
strip(char* ep, char* d, char* a, int lev, int flag)
{
	Bits h = trypref(ep, a, lev, flag);

	USED(d);
	if(Set(h,MONO) && ISVOWEL(*ep) && ISVOWEL(ep[-2]))
		h = 0;
	if(h)
		return h;
	if(ISVOWEL(*ep) && !ISVOWEL(ep[-1]) && ep[-1]==ep[-2]) {
		h = trypref(ep-1,a,lev,flag|MONO);
		if(h)
			return h;
	}
	return trysuff(ep,lev,flag);
}

/*
 * rule for -s and -'s.  refused beyond level DLEV+1, and,
 * when the suffix is a plain s, after x, z, s, ch, sh, or a
 * y that follows a consonant in a word not starting upper case.
 */
Bits
s(char* ep, char* d, char* a, int lev, int flag)
{
	if(lev > DLEV+1)
		return 0;
	if(*ep=='s') {
		switch(ep[-1]) {
		case 'y':
			if(ISVOWEL(ep[-2])||ISUPPER(*word))
				break;	/*says Kennedys*/
			/* fall through */
		case 'x':
		case 'z':
		case 's':
			return 0;
		case 'h':
			switch(ep[-2]) {
			case 'c':
			case 's':
				return 0;
			}
		}
	}
	return strip(ep,d,a,lev,flag);
}

/*
 * rule for -an and -ian: only the stem itself (with prefixes)
 * is looked up, and only when the word starts in upper case.
 */
Bits
an(char* ep, char* d, char* a, int lev, int flag)
{
	USED(d);
	if(!ISUPPER(*word))	/*must be proper name*/
		return 0;
	return trypref(ep,a,lev,flag);
}

/*
 * try the stem with its last letter replaced by 'e';
 * the letter is put back before returning.
 */
Bits
ize(char* ep, char* d, char* a, int lev, int flag)
{
	int temp = ep[-1];
	Bits h;

	USED(a);
	ep[-1] = 'e';
	h = strip(ep,"",d,lev,flag);
	ep[-1] = temp;
	return h;
}

/*
 * try the stem with an 'e' appended (written over the first
 * letter of the suffix and restored afterwards).  refused
 * when the stem ends in a, e or i.
 */
Bits
y_to_e(char* ep, char* d, char* a, int lev, int flag)
{
	Bits h;
	int  temp;

	USED(a);
	switch(ep[-1]) {
	case 'a':
	case 'e':
	case 'i':
		return 0;
	}
	temp = *ep;
	*ep++ = 'e';
	h = strip(ep,"",d,lev,flag);
	ep[-1] = temp;
	return h;
}

/*
 * rule for suffixes such as -ly, -ness, -ful: a stem ending
 * in i goes through i_to_y(), anything else through cstrip().
 */
Bits
ily(char* ep, char* d, char* a, int lev, int flag)
{
	int temp = ep[0];
	char *cp = ep;

	if(temp==ep[-1]&&temp==ep[-2])		/* sillly */
		return 0;
	if(*--cp=='y' && !ISVOWEL(*--cp))	/* happyly */
		while(cp>word)
			if(ISVOWEL(*--cp))	/* shyness */
				return 0;
	if(ep[-1]=='i')
		return i_to_y(ep,d,a,lev,flag);
	return cstrip(ep,d,a,lev,flag);
}

/*
 * rule for -ibility: write an 'l' over the letter at ep and
 * let y_to_e() append the 'e'.  the letter overwritten here is
 * not put back; main() rebuilds word from the original before
 * its second pass.
 */
Bits
bility(char* ep, char* d, char* a, int lev, int flag)
{
	*ep++ = 'l';
	return y_to_e(ep,d,a,lev,flag);
}

/*
 * when the stem ends in a consonant followed by i, try it
 * with the i turned back into y, using d as the message;
 * otherwise try the stem unchanged.  refused for words that
 * start in upper case.
 */
Bits
i_to_y(char* ep, char* d, char* a, int lev, int flag)
{
	Bits h;
	int temp;

	if(ISUPPER(*word))
		return 0;
	if((temp=ep[-1])=='i' && !ISVOWEL(ep[-2])) {
		ep[-1] = 'y';
		a = d;
	}
	h = cstrip(ep,"",a,lev,flag);
	ep[-1] = temp;
	return h;
}

/*
 * rule for -es: accepted only at level DLEV or below and
 * only after i (as -ies), ch, sh, s, z or x.
 */
Bits
es(char* ep, char* d, char* a, int lev, int flag)
{
	if(lev>DLEV)
		return 0;
	switch(ep[-1]) {
	default:
		return 0;
	case 'i':
		return i_to_y(ep,d,a,lev,flag);
	case 'h':
		switch(ep[-2]) {
		default:
			return 0;
		case 'c':
		case 's':
			break;
		}
		/* fall through */
	case 's':
	case 'z':
	case 'x':
		return strip(ep,d,a,lev,flag);
	}
}

/*
 * substitution rule.  d has the form "-old+new": the text
 * "old" is written so that it ends at ep, the result is tried
 * with strip(), and then "new" is written back starting where
 * "old" began.
 */
Bits
subst(char* ep, char* d, char* a, int lev, int flag)
{
	char *u,*t;
	Bits h;

	USED(a);
	if(skipv(skipv(ep-1)) < word)
		return 0;
	for(t=d; *t!='+'; t++)
		continue;
	for(u=ep; *--t!='-';)
		*--u = *t;
	h = strip(ep,"",d,lev,flag);
	while(*++t != '+')
		continue;
	while(*++t)
		*u++ = *t;
	return h;
}

/*
 * rule for -tion and -tor: when the letter before the final
 * stem letter is a, e, i, o or u the stem is tried with an e
 * appended; otherwise it is looked up as is.
 */
Bits
tion(char* ep, char* d, char* a, int lev, int flag)
{
	switch(ep[-2]) {
	default:
		return trypref(ep,a,lev,flag);
	case 'a':
	case 'e':
	case 'i':
	case 'o':
	case 'u':
		return y_to_e(ep,d,a,lev,flag);
	}
}

/*
 * possible consonant-consonant-e ending
 */
Bits
CCe(char* ep, char* d, char* a, int lev, int flag)
{
	Bits h;

	switch(ep[-1]) {
	case 'l':
		if(ISVOWEL(ep[-2]))
			break;
		switch(ep[-2]) {
		case 'l':
		case 'r':
		case 'w':
			break;
		default:
			return y_to_e(ep,d,a,lev,flag);
		}
		break;
	case 'c':
	case 'g':
		if(*ep == 'a')	/* prevent -able for -eable */
			return 0;
		/* fall through */
	case 's':
	case 'v':
	case 'z':
		if(ep[-2]==ep[-1])
			break;
		if(ISVOWEL(ep[-2]))
			break;
		/* fall through */
	case 'u':
		if(h = y_to_e(ep,d,a,lev,flag))
			return h;
		if(!(ep[-2]=='n' && ep[-1]=='g'))
			return 0;
	}
	return VCe(ep,d,a,lev,flag);
}

/*
 * possible consonant-vowel-consonant-e ending
 *
 * note that the 'e' written at ep is left in place
 * when the lookup succeeds.
 */
Bits
VCe(char* ep, char* d, char* a, int lev, int flag)
{
	int c;
	Bits h;

	c = ep[-1];
	if(c=='e')
		return 0;
	if(!ISVOWEL(c) && ISVOWEL(ep[-2])) {
		c = *ep;
		*ep++ = 'e';
		h = trypref(ep,d,lev,flag);
		if(!h)
			h = trysuff(ep,lev,flag);
		if(h)
			return h;
		ep--;
		*ep = c;
	}
	return cstrip(ep,d,a,lev,flag);
}

/*
 * find the first prefix in preftab that begins the text at
 * *wp and leaves at least one vowel before ep.  on success
 * *wp is advanced past the prefix and the table entry is
 * returned; otherwise 0 is returned and *wp is unchanged.
 * only the table selection folds case; the comparison itself
 * is exact.
 */
Ptab*
lookuppref(uchar** wp, char* ep)
{
	Ptab *sp;
	uchar *bp,*cp;
	unsigned int initchar = Tolower(**wp);

	if(!ISALPHA(initchar))
		return 0;
	for(sp=preftab[initchar-'a'];sp->s;sp++) {
		bp = *wp;
		for(cp= (uchar*)sp->s;*cp; )
			if(*bp++!=*cp++)
				goto next;
		for(cp=bp;cp<(uchar*)ep;cp++)
			if(ISVOWEL(*cp)) {
				*wp = bp;
				return sp;
			}
	next:;
	}
	return 0;
}

/*	while word is not in dictionary try stripping
 *	prefixes. Fail if no more prefixes.
 *
 *	a is the derivation message for this level ("." at the
 *	top level).  a dictionary hit counts only when it has a
 *	bit in common with flag (MONO aside) and, if MONO is
 *	asked for, has MONO set.
 */
Bits
trypref(char* ep, char* a, int lev, int flag)
{
	Ptab *tp;
	char *bp, *cp;
	char *pp;
	Bits h;
	char space[20];		/* "+prefix+prefix..." message; hides the global space */

	if(lev<DSIZ) {
		deriv[lev].mesg = a;
		deriv[lev].type = *a=='.'? NONE: SUFF;
	}
	if(h = tryword(word,ep,lev,flag)) {
		if(Set(h, flag&~MONO) && (flag&MONO) <= Set(h, MONO))
			return h;
		h = 0;
	}
	bp = word;
	pp = space;
	if(lev<DSIZ) {
		deriv[lev+1].mesg = pp;
		deriv[lev+1].type = 0;
	}
	while(tp=lookuppref((uchar**)&bp,ep)) {
		/*
		 * append "+prefix" to the message.  several prefixes
		 * can be stripped in a row, so truncate rather than
		 * run off the end, and always leave it terminated.
		 */
		if(pp < space+sizeof(space)-1)
			*pp++ = '+';
		for(cp=tp->s; *cp && pp<space+sizeof(space)-1; cp++)
			*pp++ = *cp;
		*pp = 0;
		if(lev<DSIZ)	/* deriv has no slot for deeper levels */
			deriv[lev+1].type += PREF;
		h = tryword(bp,ep,lev+1,flag);
		if(Set(h,NOPREF) ||
		   ((tp->flag&IN) && inun(bp-2,h)==0)) {
			h = 0;
			break;
		}
		if(Set(h,flag&~MONO) && (flag&MONO) <= Set(h, MONO))
			break;
		h = 0;
	}
	if(lev < DSIZ) {
		deriv[lev+1] = emptyderiv;
		deriv[lev+2] = emptyderiv;
	}
	return h;
}

/*
 * look up bp..ep in the dictionary; strings of fewer than two
 * characters are rejected.  with MONO in flag, a "+x" step is
 * recorded, x being the character at ep.  with -v, a hit also
 * appends the messages of the current derivation to affix and
 * sets prefcount and suffcount.
 */
Bits
tryword(char* bp, char* ep, int lev, int flag)
{
	int  j;
	Bits h = 0;
	char duple[3];

	if(ep-bp <= 1)
		return h;
	if(flag&MONO) {
		if(lev<DSIZ) {
			deriv[++lev].mesg = duple;
			deriv[lev].type = SUFF;
		}
		duple[0] = '+';
		duple[1] = *ep;
		duple[2] = 0;
	}
	h = dict(bp, ep);
	if(vflag==0 || h==0)
		return h;
	/*
	 * when derivations are wanted, collect them
	 * for printing
	 */
	j = lev;
	prefcount = suffcount = 0;
	do {
		if(j<DSIZ && deriv[j].type) {
			/* hits rejected by the caller pile up in affix; don't overrun it */
			if(strlen(affix)+strlen(deriv[j].mesg) < sizeof(affix))
				strcat(affix, deriv[j].mesg);
			if(deriv[j].type == SUFF)
				suffcount++;
			else if(deriv[j].type != NONE)
				prefcount = deriv[j].type/PREF;
		}
	} while(--j > 0);
	return h;
}

/*
 * decide whether a two-letter negative prefix suits the stem.
 * bp points at the prefix.  "un" is accepted when the stem
 * lacks IN.  the others need IN and must agree with the first
 * stem letter: "ir" before r, "im" before m or p, "in" otherwise.
 */
int
inun(char* bp, Bits h)
{
	if(*bp == 'u')
		return Set(h, IN) == 0;
	/* *bp == 'i' */
	if(Set(h, IN) == 0)
		return 0;
	switch(bp[2]) {
	case 'r':
		return bp[1] == 'r';
	case 'm':
	case 'p':
		return bp[1] == 'm';
	}
	return bp[1] == 'n';
}

/*
 * step backwards from s over at most one vowel and then over
 * any consonants; the result is less than word when the
 * beginning of the word was passed.
 */
char*
skipv(char *s)
{
	if(s >= word && ISVOWEL(*s))
		s--;
	while(s >= word && !ISVOWEL(*s))
		s--;
	return s;
}

/*
 * crummy way to Britishise
 */
void
ise(void)
{
	Suftab *p;
	int i;

	for(i=0; i<26; i++)
		for(p = suftab[i]; p->suf; p++) {
			p->suf = ztos(p->suf);
			p->d1 = ztos(p->d1);
			p->a1 = ztos(p->a1);
		}
}

/*
 * return as itself when it holds no 'z', otherwise a
 * strdup'ed copy with every 'z' turned into 's'.
 */
char*
ztos(char *as)
{
	char *s, *ds;

	for(s=as; *s; s++)
		if(*s == 'z')
			goto copy;
	return as;

copy:
	ds = strdup(as);
	for(s=ds; *s; s++)
		if(*s == 'z')
			*s = 's';
	return ds;
}

/*
 * binary search of the dictionary for bp..ep.  spacep[],
 * indexed by the first two letters, bounds the search; within
 * that range each entry starts with two bytes, the first with
 * its 0x80 bit set, followed by the rest of the word.  returns
 * the entry's bits from encode[], or 0 when the word is absent.
 */
Bits
dict(char* bp, char* ep)
{
	char *cp, *cp1, *w, *wp, *we;
	int n, f;

	w = bp;
	we = ep;
	n = ep-bp;
	if(n <= 1)
		return NOUN;

	f = w[0] & 0x7f;
	f *= 128;
	f += w[1] & 0x7f;
	bp = spacep[f];
	ep = spacep[f+1];

loop:
	if(bp >= ep) {
		if(xflag)
			fprint(2, "=%.*s\n", utfnlen(w, n), w);
		return 0;
	}
	/*
	 * find the beginning of some word in the middle
	 */
	cp = bp + (ep-bp)/2;

	while(cp > bp && !(*cp & 0x80))
		cp--;
	while(cp > bp && (cp[-1] & 0x80))
		cp--;

	wp = w + 2;	/* skip two letters */
	cp1 = cp + 2;	/* skip affix code */
	for(;;) {
		if(wp >= we) {
			if(*cp1 & 0x80)
				goto found;
			else
				f = 1;
			break;
		}
		if(*cp1 & 0x80) {
			f = -1;
			break;
		}
		f = *cp1++ - *wp++;
		if(f != 0)
			break;
	}

	if(f < 0) {
		while(!(*cp1 & 0x80))
			cp1++;
		bp = cp1;
		goto loop;
	}
	ep = cp;
	goto loop;

found:
	f = ((cp[0] & 0x7) << 8) |
		(cp[1] & 0xff);
	if(xflag) {
		fprint(2, "=%.*s ", utfnlen(w, n), w);
		typeprint(encode[f]);
	}
	return encode[f];
}

/*
 * print the names of the bits set in h on fd 2,
 * separated by commas and followed by a newline.
 */
void
typeprint(Bits h)
{

	pcomma("");
	if(h & NOUN)
		pcomma("n");
	if(h & PROP_COLLECT)
		pcomma("pc");
	if(h & VERB) {
		if((h & VERB) == VERB)
			pcomma("v");
		else
		if((h & VERB) == V_IRREG)
			pcomma("vi");
		else
		if(h & ED)
			pcomma("ed");
	}
	if(h & ADJ)
		pcomma("a");
	if(h & COMP) {
		if((h & COMP) == ACTOR)
			pcomma("er");
		else
			pcomma("comp");
	}
	if(h & DONT_TOUCH)
		pcomma("d");
	if(h & N_AFFIX)
		pcomma("na");
	if(h & ADV)
		pcomma("adv");
	if(h & ION)
		pcomma("ion");
	if(h & V_AFFIX)
		pcomma("va");
	if(h & MAN)
		pcomma("man");
	if(h & NOPREF)
		pcomma("nopref");
	if(h & MONO)
		pcomma("ms");
	if(h & IN)
		pcomma("in");
	if(h & _Y)
		pcomma("y");
	if(h & STOP)
		pcomma("s");
	fprint(2, "\n");
}

/*
 * print s on fd 2, preceded by a comma unless it is the
 * first item since the last call with an empty string.
 */
void
pcomma(char *s)
{
	static int flag;

	if(*s == 0) {
		flag = 0;
		return;
	}
	if(!flag) {
		fprint(2, "%s", s);
		flag = 1;
	} else
		fprint(2, ",%s", s);
}

/*
 * is the word one of the following
 *	12th	teen
 *	21st	end in 1
 *	23rd	end in 3
 *	77th	default
 * called knowing word[0] is a digit
 */
int
ordinal(void)
{
	char *cp = word;
	static char sp[4];

	while(ISDIGIT(*cp))
		cp++;
	strncpy(sp,cp,3);
	if(ISUPPER(cp[0]) && ISUPPER(cp[1])) {
		sp[0] = Tolower(cp[0]);
		sp[1] = Tolower(cp[1]);
	}
	return 0 == strncmp(sp,
		(cp-word >= 2 && cp[-2]=='1')? "th":	/* teens; needs two digits */
		*--cp=='1'? "st":	/* safe: word[0] is a digit */
		*cp=='2'? "nd":
		*cp=='3'? "rd":
		"th", 3);
}

/*
 * read in the dictionary.
 * format is
 * {
 *	short	nencode;
 *	long	encode[nencode];
 *	char	space[*];
 * };
 *
 * the encodings are a table of all the different
 * affix codes.
 * the dictionary proper has 2 bytes
 * that demark and then the rest of the
 * word. the 2 bytes have the following
 *	0x80 0x00	flag
 *	0x78 0x00	count of prefix bytes
 *			common with prev word
 *	0x07 0xff	affix code
 *
 * all ints are big endians in the file.
 *
 * the words are read into the top of space and expanded
 * towards the bottom, filling spacep[] on the way.
 * exits with a message on any error.
 */
void
readdict(char *file)
{
	char *s, *is, *lasts, *ls;
	int c, i, sp, p;
	int f;
	long l;

	lasts = 0;
	f = open(file, 0);
	if(f == -1) {
		fprint(2, "cannot open %s\n", file);
		exits("open");
	}
	if(read(f, space, 2) != 2)
		goto bad;
	nencode = ((space[0]&0xff)<<8) | (space[1]&0xff);
	if(nencode > sizeof(encode)/sizeof(encode[0]))	/* would overrun encode[] */
		goto bad;
	if(read(f, space, 4*nencode) != 4*nencode)
		goto bad;
	s = space;
	for(i=0; i<nencode; i++) {
		l = (long)(s[0] & 0xff) << 24;
		l |= (s[1] & 0xff) << 16;
		l |= (s[2] & 0xff) << 8;
		l |= s[3] & 0xff;
		encode[i] = (Bits)l;
		s += 4;
	}
	l = read(f, space, sizeof(space));
	if(l <= 0)		/* read error, or no words at all */
		goto bad;
	if(l == sizeof(space))
		goto noroom;
	/* move the packed words to the top; they are unpacked from the bottom up */
	is = space + (sizeof(space) - l);
	memmove(is, space, l);

	s = space;
	c = *is++ & 0xff;
	sp = -1;
	i = 0;

loop:
	/*
	 * the output must stay behind the input, and the
	 * final fence needs a byte of its own
	 */
	if(s >= is)
		goto noroom;
	if(c < 0) {
		close(f);
		while(sp < 128*128)
			spacep[++sp] = s;
		*s = 0x80;		/* fence */
		return;
	}
	p = (c>>3) & 0xf;
	*s++ = c;
	*s++ = *is++ & 0xff;
	if(p <= 0)
		i = (*is++ & 0xff)*128;
	if(p <= 1) {
		if(!(*is & 0x80))
			i = i/128*128 + (*is++ & 0xff);
		if(i >= 128*128)	/* first letter out of range for spacep[] */
			goto bad;
		if(i <= sp) {
			fprint(2, "the dict isnt sorted or \n");
			fprint(2, "memmove didn't work\n");
			goto bad;
		}
		while(sp < i)
			spacep[++sp] = s-2;
	}
	ls = lasts;
	if(p > 2 && ls == 0)	/* first word cannot share a prefix */
		goto bad;
	lasts = s;
	for(p-=2; p>0; p--)
		*s++ = *ls++;
	for(;;) {
		if(is >= space+sizeof(space)) {
			c = -1;
			break;
		}
		c = *is++ & 0xff;
		if(c & 0x80)
			break;
		*s++ = c;
	}
	*s = 0;
	goto loop;

bad:
	fprint(2, "trouble reading %s\n", file);
	exits("read");

noroom:
	fprint(2, "not enough space for dictionary\n");
	exits("space");
}