shithub: riscv

ref: dfb4b522830edab8f3856289d326d6cf5e930644
dir: /sys/src/cmd/dict/comfix.awk/

View raw version
# when raw index has a lot of entries like
# 1578324	problematico, a, ci, che
# apply this algorithm:
#  treat things after comma as suffixes
#  for each suffix:
#      if single letter, replace last letter
#      else search backwards for beginning of suffix
#      and if it leads to an old suffix of approximately
#      the same length, replace that suffix
# This will still leave some commas to fix by hand
# Usage: awk -F'	' -f comfix.awk rawindex > newrawindex

NF == 2	{
		i = index($2, ",")
		if(i == 0 || length($2) == 0)
			print $0
		else {
			n = split($2, a, /,[ ]*/)
			w = a[1]
			printf "%s\t%s\n", $1, w
			for(i = 2; i <= n; i++) {
				suf = a[i]
				m = matchsuflen(w, suf)
				if(m) {
					nw = substr(w, 1, length(w)-m) suf
					printf "%s\t%s\n", $1, nw
				} else
					printf "%s\t%s\n", $1, w ", " suf
			}
		}
	}
NF != 2 {
	print $0
	}

function matchsuflen(w, suf,		wlen,suflen,c,pat,k,d)
{
	wlen = length(w)
	suflen = length(suf)
	if(suflen == 1)
		return 1
	else {
		c = substr(suf, 1, 1)
		for (k = 1; k <= wlen ; k++)
			if(substr(w, wlen-k+1, 1) == c)
				break
		if(k > wlen)
			return 0
		d = k-suflen
		if(d < 0)
			d = -d
		if(d > 3)
			return 0
		return k
	}
}