shithub: MicroHs

Download patch

ref: b2c6861bee0841dbe2d99cc19c2512630094b5a0
parent: b1c5646248fdf3bc73ea9bcee3d158c27c56e5fe
author: Lennart Augustsson <lennart.augustsson@epicgames.com>
date: Mon Jan 8 07:48:20 EST 2024

Some UTF8 stuff

--- a/src/runtime/bfile.c
+++ b/src/runtime/bfile.c
@@ -9,10 +9,36 @@
 /* BFILE will have different implementations, they all have these methods */
 typedef struct BFILE {
   int (*getb)(struct BFILE*);
-  void (*ungetb)(int c, struct BFILE*);
+  void (*ungetb)(int, struct BFILE*);   /* push one value back */
+  void (*putb)(int, struct BFILE*);     /* write one value; 0 when the BFILE has no writer */
   void (*closeb)(struct BFILE*);
 } BFILE;
 
+static inline int
+getb(struct BFILE *p)           /* call p's getb method and return its result */
+{
+  return p->getb(p);
+}
+
+static inline void
+ungetb(int c, struct BFILE *p)  /* call p's ungetb method with c */
+{
+  p->ungetb(c, p);
+}
+
+static inline void
+putb(int c, struct BFILE *p)    /* call p's putb method with c */
+{
+  p->putb(c, p);                /* NOTE: no null check; some BFILEs in this patch set putb to 0 */
+}
+
+static inline void
+closeb(struct BFILE *p)         /* call p's closeb method */
+{
+  p->closeb(p);
+}
+
+
 /***************** BFILE from static buffer *******************/
 struct BFILE_buffer {
   BFILE    mets;
@@ -45,6 +71,8 @@
   (void)bp;                     /* shut up warning */
 }
 
+/* There is no open().  Only used with statically allocated buffers. */
+
 #if WANT_STDIO
 /***************** BFILE via FILE *******************/
 struct BFILE_file {
@@ -67,6 +95,13 @@
 }
 
 void
+putb_file(int c, BFILE *bp)     /* putb method of the FILE-backed BFILE */
+{
+  struct BFILE_file *p = (struct BFILE_file *)bp;
+  (void)fputc(c, p->file);      /* result explicitly discarded: write errors are not reported */
+}
+
+void
 closeb_file(BFILE *bp)
 {
   struct BFILE_file *p = (struct BFILE_file *)bp;
@@ -81,6 +116,7 @@
     memerr();
   p->mets.getb   = getb_file;
   p->mets.ungetb = ungetb_file;
+  p->mets.putb   = putb_file;
   p->mets.closeb = closeb_file;
   p->file = f;
   return (BFILE*)p;
@@ -151,9 +187,9 @@
   int c, n;
 
   /* Do we have an ungetb character? */
-  if (p->unget) {
+  if (p->unget >= 0) {
     c = p->unget;
-    p->unget = 0;
+    p->unget = -1;
     return c;
   }
   /* Are we in the middle of emitting a string? */
@@ -193,7 +229,7 @@
 ungetb_lzw(int c, BFILE *bp)
 {
   struct BFILE_lzw *p = (struct BFILE_lzw*)bp;
-  if (p->unget)
+  if (p->unget >= 0)
     ERR("ungetb_lzw");
   p->unget = c;
 }
@@ -214,15 +250,18 @@
 BFILE *
 add_lzw_decompressor(BFILE *file)
 {
-  struct BFILE_lzw *p = calloc(1, sizeof(struct BFILE_lzw));
+  struct BFILE_lzw *p = MALLOC(sizeof(struct BFILE_lzw));
   int i;
   
   if (!p)
     memerr();
+  memset(p, 0, sizeof(struct BFILE_lzw));
   p->mets.getb = getb_lzw;
   p->mets.ungetb = ungetb_lzw;
+  p->mets.putb = 0;             /* no compressor yet. */
   p->mets.closeb = closeb_lzw;
   p->bfile = file;
+  p->unget = -1;
 
   /* initialize dictionary with printable ASCII */
   for(i = 0; i < ASCIISIZE-1; i++) {
@@ -240,3 +279,111 @@
   return (BFILE *)p;
 }
 
+/***************** BFILE with UTF8 encode/decode *******************/
+
+struct BFILE_utf8 {
+  BFILE    mets;                /* method table; first member, the struct is cast to/from BFILE* */
+  BFILE    *bfile;              /* wrapped BFILE that carries the encoded bytes */
+  int      unget;               /* pushed-back value, or -1 when there is none */
+};
+
+/* Decode one UTF-8 sequence from the wrapped BFILE and return its code point.
+ * Returns -1 at EOF (also when EOF cuts a sequence short).  A byte that
+ * cannot start a sequence is reported through ERR before reading further. */
+int
+getb_utf8(BFILE *bp)
+{
+  struct BFILE_utf8 *p = (struct BFILE_utf8*)bp;
+  int c1, c2, c3, c4;
+
+  /* Do we have an ungetb character? */
+  if (p->unget >= 0) {
+    c1 = p->unget;
+    p->unget = -1;
+    return c1;
+  }
+  c1 = p->bfile->getb(p->bfile);
+  if (c1 < 0)
+    return -1;
+  if ((c1 & 0x80) == 0)
+    return c1;                  /* single byte (ASCII) */
+  if ((c1 & 0xc0) == 0x80 || (c1 & 0xf8) > 0xf0)
+    ERR("getb_utf8");           /* stray continuation byte, or 0xf8..0xff */
+  if ((c2 = p->bfile->getb(p->bfile)) < 0)
+    return -1;
+  if ((c1 & 0xe0) == 0xc0)
+    return ((c1 & 0x1f) << 6) | (c2 & 0x3f);
+  if ((c3 = p->bfile->getb(p->bfile)) < 0)
+    return -1;
+  if ((c1 & 0xf0) == 0xe0)
+    return ((c1 & 0x0f) << 12) | ((c2 & 0x3f) << 6) | (c3 & 0x3f);
+  if ((c4 = p->bfile->getb(p->bfile)) < 0)
+    return -1;
+  return ((c1 & 0x07) << 18) | ((c2 & 0x3f) << 12) | ((c3 & 0x3f) << 6) | (c4 & 0x3f);
+}
+
+void
+ungetb_utf8(int c, BFILE *bp)   /* push c back; the next getb_utf8 returns it */
+{
+  struct BFILE_utf8 *p = (struct BFILE_utf8*)bp;
+  if (p->unget >= 0)
+    ERR("ungetb_utf8");         /* a value is already pending; only one slot exists */
+  p->unget = c;
+}
+
+void
+putb_utf8(int c, BFILE *bp)     /* write c to the wrapped BFILE as 1 to 4 UTF-8 bytes */
+{
+  struct BFILE_utf8 *p = (struct BFILE_utf8 *)bp;
+  if (c < 0)
+    ERR("putb_utf8: < 0");
+  if (c < 0x80) {               /* 1 byte: written unchanged */
+    p->bfile->putb(c, p->bfile);
+    return;
+  }
+  if (c < 0x800) {              /* 2 bytes: 110xxxxx 10xxxxxx */
+    p->bfile->putb(((c >> 6 )       ) | 0xc0, p->bfile);
+    p->bfile->putb(((c      ) & 0x3f) | 0x80, p->bfile);
+    return;
+  }
+  if (c < 0x10000) {            /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
+    p->bfile->putb(((c >> 12)       ) | 0xe0, p->bfile);
+    p->bfile->putb(((c >> 6 ) & 0x3f) | 0x80, p->bfile);
+    p->bfile->putb(((c      ) & 0x3f) | 0x80, p->bfile);
+    return;
+  }
+  if (c < 0x110000) {           /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+    p->bfile->putb(((c >> 18)       ) | 0xf0, p->bfile);
+    p->bfile->putb(((c >> 12) & 0x3f) | 0x80, p->bfile);
+    p->bfile->putb(((c >> 6 ) & 0x3f) | 0x80, p->bfile);
+    p->bfile->putb(((c      ) & 0x3f) | 0x80, p->bfile);
+    return;
+  }
+  ERR("putb_utf8");             /* c >= 0x110000 is not encoded */
+}
+
+void
+closeb_utf8(BFILE *bp)          /* close the wrapped BFILE, then release this wrapper */
+{
+  struct BFILE_utf8 *p = (struct BFILE_utf8*)bp;
+
+  p->bfile->closeb(p->bfile);
+  FREE(p);
+}
+
+BFILE *
+add_utf8(BFILE *file)           /* allocate a UTF-8 BFILE layered on top of file */
+{
+  struct BFILE_utf8 *p = MALLOC(sizeof(struct BFILE_utf8));
+  
+  if (!p)
+    memerr();
+  p->mets.getb = getb_utf8;
+  p->mets.ungetb = ungetb_utf8;
+  p->mets.putb = putb_utf8;
+  p->mets.closeb = closeb_utf8;
+  p->bfile = file;              /* closeb_utf8 will also close this BFILE */
+  p->unget = -1;                /* no pushed-back value yet */
+
+  return (BFILE*)p;
+}
--- a/src/runtime/eval.c
+++ b/src/runtime/eval.c
@@ -1018,11 +1018,11 @@
 int
 gobble(BFILE *f, int c)
 {
-  int d = f->getb(f);
+  int d = getb(f);              /* read the next value from f */
   if (c == d) {
     return 1;
   } else {
-    f->ungetb(d, f);
+    ungetb(d, f);               /* not the expected value: push it back */
     return 0;
   }
 }
@@ -1033,9 +1033,9 @@
 {
   int c;
   
-  c = f->getb(f);
+  c = getb(f);
   if (c == ' ' || c == ')') {
-    f->ungetb(c, f);
+    ungetb(c, f);
     return 0;
   } else {
     return c;
@@ -1047,16 +1047,16 @@
 {
   value_t i = 0;
   value_t neg = 1;
-  int c = f->getb(f);
+  int c = getb(f);
   if (c == '-') {
     neg = -1;
-    c = f->getb(f);
+    c = getb(f);
   }
   for(;;) {
     i = i * 10 + c - '0';
-    c = f->getb(f);
+    c = getb(f);
     if (c < '0' || c > '9') {
-      f->ungetb(c, f);
+      ungetb(c, f);
       break;
     }
   }
@@ -1129,7 +1129,7 @@
   if (!buffer)
     memerr();
   for(i = 0;;) {
-    c = f->getb(f);
+    c = getb(f);
     if (c == '"')
       break;
     if (i >= sz) {
@@ -1161,7 +1161,7 @@
   int c;
   char buf[80];                 /* store names of primitives. */
 
-  c = f->getb(f);
+  c = getb(f);
   if (c < 0) ERR("parse EOF");
   switch (c) {
   case '(' :
@@ -1260,7 +1260,7 @@
   int c;
 
   while ((c = *p++)) {
-    if (c != f->getb(f))
+    if (c != getb(f))
       ERR("version mismatch");
   }
   gobble(f, '\r');                 /* allow extra CR */
@@ -1293,7 +1293,7 @@
   BFILE *p = openb_FILE(f);
   /* And parse it */
   NODEPTR n = parse_top(p);
-  p->closeb(p);
+  closeb(p);
   return n;
 }
 
@@ -2654,9 +2654,9 @@
 
   if (combexpr) {
     int c;
-    struct BFILE_buffer ibf = { { getb_buf, ungetb_buf, closeb_buf }, combexprlen, 0, combexpr };
+    struct BFILE_buffer ibf = { { getb_buf, ungetb_buf, 0, closeb_buf }, combexprlen, 0, combexpr };
     BFILE *bf = (BFILE*)&ibf;
-    c = bf->getb(bf);
+    c = getb(bf);
     /* Compressed combinators start with a 'Z', otherwise 'v' (for version) */
     if (c == 'Z') {
       /* add compressor transducer */
@@ -2663,10 +2663,10 @@
       bf = add_lzw_decompressor(bf);
     } else {
       /* put it back, we need it */
-      bf->ungetb(c, bf);
+      ungetb(c, bf);
     }
     prog = parse_top(bf);
-    bf->closeb(bf);
+    closeb(bf);
   } else {
 #if WANT_STDIO
     prog = parse_file(inname, &file_size);
--