ref: 9ff7598b92db75af690f20caeb41fae9ed9acdb2
dir: /sys/src/libc/port/runenorm.c/
#include <u.h>
#include <libc.h>
#include "runenormdata"
//Unicode Standard: Section 3.12 Conjoining Jamo Behavior
enum {
SBase = 0xAC00,
LBase = 0x1100,
VBase = 0x1161,
TBase = 0x11A7,
LCount = 19,
VCount = 21,
TCount = 28,
NCount = VCount * TCount,
SCount = LCount * NCount,
LLast = LBase + LCount - 1,
SLast = SBase + SCount - 1,
VLast = VBase + VCount - 1,
TLast = TBase + TCount - 1,
};
/*
* Most runes decompose in to one/two
* other runes with codepoints < 0xFFFF,
* however there are some exceptions.
* To keep the table size down we instead
* store an index in to an exception range
* within the private use section and use
* an exception table.
*/
enum {
Estart = 0xEEEE,
Estop = 0xF8FF,
};
static Rune
_runedecomp(Rune c, Rune *r2)
{
uint x;
if(c < Runeself){
*r2 = 0;
return 0;
}
//korean
if(c >= SBase && c <= SLast){
c -= SBase;
x = c % TCount;
if(x){
*r2 = TBase + x;
return SBase + (c - x);
}
*r2 = VBase + ((c % NCount) / TCount);
return LBase + (c / NCount);
}
x = decomplkup(c);
if((x & 0xFFFF) != 0){
*r2 = x & 0xFFFF;
return x>>16;
}
x >>= 16;
if(x >= Estart && x < Estop){
Rune *r;
r = _decompexceptions[x - Estart];
*r2 = r[1];
return r[0];
}
*r2 = 0;
return x;
}
static Rune
_runerecomp(Rune r0, Rune r1)
{
uint x, y, *p, next;
if(r0 >= LBase && r0 <= LLast){
if(r1 < VBase || r1 > VLast)
return 0;
x = (r0 - LBase) * NCount + (r1 - VBase) * TCount;
return SBase + x;
}
if(r0 >= SBase && r0 <= SLast && (r0 - SBase) % TCount == 0){
if(r1 > TBase && r1 <= TLast)
return r0 + (r1 - TBase);
return 0;
}
if(r0 > 0xFFFF || r1 > 0xFFFF){
for(x = 0; x < nelem(_recompexceptions); x++)
if(r0 == _recompexceptions[x][1] && r1 == _recompexceptions[x][2])
return _recompexceptions[x][0];
return 0;
}
y = x = r0<<16 | r1;
x ^= x >> 16;
x *= 0x21f0aaad;
x ^= x >> 15;
x *= 0xd35a2d97;
x ^= x >> 15;
p = _recompdata + (x%512)*2;
while(p[0] != y){
next = p[1]>>16;
if(!next)
return 0;
p = _recompcoll + (next-1)*2;
}
return p[1] & 0xFFFF;
}
static void
runecccsort(Rune *a, int len)
{
Rune r;
int i, j;
for(i = 1; i < len; i++){
r = a[i];
for(j = i; j > 0 && ccclkup(a[j-1]) > ccclkup(r); j--)
a[j] = a[j-1];
a[j] = r;
}
}
static int
boundary(Rune r)
{
return !(qclkup(r) & (Qnfcno|Qnfcmay));
}
/*
* Stk stores the entire context for a chunk of
* an input string that is being normalized.
* In accordance to the standard, Unicode text
* has no upper bound for the amount of conjoining
* (also called non-starter) elements associated with
* a base rune. Thus to implement normalization within
* reasonable memory constraints we implement the
* "Stream-Safe Text Format" as defined in UAX #15 § 13.
*/
typedef struct {
Rune a[Maxnormctx];
Rune *e;
} Stk;
static int
push(Stk *s, Rune c)
{
int n, l;
Rune r2, b[Maxdecomp];
Rune *p = b + nelem(b) - 1;
for(*p = c; c = _runedecomp(c, &r2); *p = c){
assert(p > b);
if(r2 != 0)
*p-- = r2;
}
n = b + nelem(b) - p;
l = nelem(s->a) - (s->e - s->a);
if(n > l){
werrstr("runenorm: buffer overflow");
return -1;
}
l -= n;
for(; n > 0; n--)
*s->e++ = *p++;
return l;
}
/*
* Worst case recomposition, this happens when we have to compose
* two runes who both have a CCC of zero.
*/
static void
worstrecomp(Stk *s)
{
int done;
Rune c, *p, *rp;
for(done = 0; done == 0;){
done = 1;
for(p = s->a; p+1 < s->e; p++){
c = _runerecomp(p[0], p[1]);
if(c == 0)
continue;
done = 0;
*p = c;
for(rp = p+1; rp < s->e-1; rp++)
rp[0] = rp[1];
s->e--;
p--;
}
}
}
static void
cccrecomp(Stk *s)
{
Rune c, *p, *rp;
for(p = s->a + 1; p < s->e; p++){
c = _runerecomp(s->a[0], *p);
if(c != 0){
s->a[0] = c;
for(rp = p; rp < s->e-1; rp++){
rp[0] = rp[1];
}
s->e--;
p--;
} else while(p + 1 < s->e && ccclkup(p[0]) == ccclkup(p[1]))
p++;
}
}
void
norminit(Norm *n, int compose, void *ctx, long (*getrune)(void*))
{
memset(n, 0, sizeof *n);
n->ctx = ctx;
n->getrune = getrune;
n->compose = compose;
n->obuf.e = n->obuf.a;
n->ibuf.e = n->ibuf.a;
}
int NORMDEBUG;
static long
peekrune(Norm *n)
{
long r;
if(n->ibuf.e > n->ibuf.a)
return n->ibuf.e[-1];
r = n->getrune(n->ctx);
if(r >= 0)
*n->ibuf.e++ = r;
return r;
}
static long
getrune(Norm *n)
{
if(n->ibuf.e > n->ibuf.a)
return *--n->ibuf.e;
return n->getrune(n->ctx);
}
long
normpull(Norm *n, Rune *rdst, long max, int flush)
{
Rune *rp, *re;
Stk stk;
Rune *dot;
int r;
long c;
rp = rdst;
re = rdst + max;
dot = nil;
c = 0;
while(rp < re){
if(n->obuf.e != n->obuf.a){
memcpy(stk.a, n->obuf.a, (n->obuf.e - n->obuf.a)*sizeof(Rune));
stk.e = stk.a + (n->obuf.e - n->obuf.a);
n->obuf.e = n->obuf.a;
c = stk.a[0];
goto Flush;
}
stk.e = stk.a;
c = getrune(n);
if(c < 0)
break;
push(&stk, c);
c = peekrune(n);
if(stk.e == stk.a+1 && stk.a[0] < Runeself && c < Runeself && c >= 0)
goto Flush;
while(c >= 0 && ccclkup(c) != 0){
r = push(&stk, getrune(n));
c = peekrune(n);
if(r > 2)
continue;
if(ccclkup(stk.a[0]) != 0){
assert(r > 0);
r--;
} else
assert(r >= 0);
if(r == 0 || (c == 0x0344 && r < 2)){
/* in reverse */
if(r > 0){
getrune(n);
*n->ibuf.e++ = 0x301;
*n->ibuf.e++ = 0x308;
}
*n->ibuf.e++ = 0x034F;
break;
}
}
if(stk.e - stk.a > 1)
runecccsort(stk.a, stk.e - stk.a);
if(!n->compose)
goto Flush;
if(ccclkup(stk.e[-1]) == 0){
Rune tmp;
while(c >= 0 && (!boundary(c) || !boundary(_runedecomp(c, &tmp)))){
if(push(&stk, getrune(n)) == -1){
*n->ibuf.e++ = c;
for(r = 0; r < Maxdecomp; r++)
*n->ibuf.e++ = *--stk.e;
break;
}
c = peekrune(n);
}
worstrecomp(&stk);
} else if(ccclkup(stk.a[0]) == 0)
cccrecomp(&stk);
Flush:
if(flush || c >= 0)
for(dot = stk.a; dot < stk.e; dot++){
if(rp == re)
goto Out;
*rp++ = *dot;
}
dot = nil;
if(c < 0)
break;
}
Out:
if(c < 0 && !flush){
while(stk.e > stk.a)
*n->ibuf.e++ = *--stk.e;
}
if(dot != nil){
memcpy(n->obuf.a, dot, (stk.e - dot) * sizeof(Rune));
n->obuf.e = n->obuf.a + (stk.e - dot);
}
return rp - rdst;
}
typedef struct {
Rune *s, *p;
int n;
} Rctx;
static long
runegetrune(void *ctx)
{
Rctx *c;
c = ctx;
if(c->p >= c->s + c->n)
return -1;
return *c->p++;
}
static long
runedostr(Rune *dst, long ndst, Rune *src, long nsrc, int comp)
{
Rctx c;
Norm n;
c.s = c.p = src;
c.n = nsrc;
norminit(&n, comp, &c, runegetrune);
return normpull(&n, dst, ndst, 1);
}
long
runecomp(Rune *dst, long ndst, Rune *src, long nsrc)
{
return runedostr(dst, ndst, src, nsrc, 1);
}
long
runedecomp(Rune *dst, long ndst, Rune *src, long nsrc)
{
return runedostr(dst, ndst, src, nsrc, 0);
}
typedef struct {
char *s, *p;
int n;
} Uctx;
static long
utfgetrune(void *ctx)
{
Uctx *c;
Rune r;
c = ctx;
if(c->p >= c->s + c->n)
return -1;
c->p += chartorune(&r, c->p);
return r;
}
static long
utfdostr(char *dst, long ndst, char *src, long nsrc, int comp)
{
Uctx c;
Norm n;
Rune buf[Maxnormctx];
long i, w;
char *e, *p;
c.s = c.p = src;
c.n = nsrc;
norminit(&n, comp, &c, utfgetrune);
for(p = dst, e = dst + ndst; p < e;){
w = normpull(&n, buf, nelem(buf), 1);
if(w == 0)
break;
for(i = 0; i < w; i++){
if(p + runelen(buf[i]) >= e)
break;
p += runetochar(p, buf+i);
}
}
return p - dst;
}
long
utfcomp(char *dst, long ndst, char *src, long nsrc)
{
return utfdostr(dst, ndst, src, nsrc, 1);
}
long
utfdecomp(char *dst, long ndst, char *src, long nsrc)
{
return utfdostr(dst, ndst, src, nsrc, 0);
}