ref: d35571cbb83b144805ef9158233f3b148a32ff2d
parent: 7192dd8d0b98e7ccddb00dea5be7e4e342e29025
author: Sigrid Solveig Haflínudóttir <ftrvxmtrx@gmail.com>
date: Sat Jan 16 17:32:10 EST 2021
add xml parser
--- a/README.md
+++ b/README.md
@@ -8,3 +8,4 @@
* `msr.c` MSR reading tool
* `nanosec.c` nanosec(), a replacement for (way more expensive) nsec()
* `qt.[ch]` [QP tries](https://dotat.at/prog/qp/README.html)
+* `xml.[ch]` XML parser, works as a streaming parser as well
--- /dev/null
+++ b/xml.c
@@ -1,0 +1,319 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <ctype.h>
+#include "xml.h"
+
+/* Entry layout: [0] length of the entity text, [1] replacement byte, [2..] entity text. */
+static char *escmap[] = {
+	"\x06\"&quot;",
+	"\x06\'&apos;",
+	"\x04<&lt;",
+	"\x04>&gt;",
+	"\x05&&amp;",
+};
+
+enum
+{
+	Xmlvalue = 1<<1,	/* private xmlread_ flag: text may be stored in the parent's v */
+};
+
+/*
+ * Decode XML text in place and return orig: CR and CRLF become LF, numeric
+ * character references become UTF-8, escmap entities become their replacement.
+ */
+static char *
+unxml(char *orig)
+{
+	char *s, *o, *e;
+	int i, rsz;
+	ulong cp;
+	Rune r;
+
+	for(s = orig, o = orig; *s != 0;){
+		if(*s == '\r'){
+			*o++ = '\n';
+			s += s[1] == '\n' ? 2 : 1;
+			continue;
+		}
+		rsz = chartorune(&r, s);
+		if(r == '&'){
+			if(s[1] == '#' && (e = strchr(s+2, ';')) != nil && e != s+2){
+				s += 2;
+				if(*s == 'x' || isdigit((uchar)*s)){
+					/* store the code point as UTF-8; 0 or out of range gives Runeerror */
+					cp = *s == 'x' ? strtoul(s+1, nil, 16) : strtoul(s, nil, 10);
+					r = cp == 0 || cp > Runemax ? Runeerror : cp;
+					o += runetochar(o, &r);
+				}
+				s = e+1;
+				continue;
+			}
+			for(i = 0; i < nelem(escmap); i++)
+				if(strncmp(s, &escmap[i][2], escmap[i][0]) == 0)
+					break;
+			if(i < nelem(escmap)){
+				*o++ = escmap[i][1];
+				s += escmap[i][0];	/* skip the whole entity, ';' included */
+				continue;
+			}
+		}
+		memmove(o, s, rsz);
+		s += rsz;
+		o += rsz;
+	}
+	*o = 0;
+	return orig;
+}
+
+/* Split name="value" pairs in place into a list (last attribute first) whose n and v
+ * point into s.  On failure sets *err, frees the list and returns nil. */
+static Xattr *
+xmlattr(char *s, int *err)
+{
+	Xattr *a, *attrs;
+	char *p;
+
+	attrs = nil;
+	*err = 0;
+	/* any run of whitespace may precede, separate or follow the attributes */
+	while(*(s += strspn(s, " \t\r\n")) != 0){
+		if((a = mallocz(sizeof(*a), 1)) == nil)
+			goto error;
+		a->n = s;
+		for(; *s && *s != '='; s++);
+		if(*s != '='){
+			werrstr("xml sucks (%d)", *s);
+			goto error;
+		}
+		*s++ = 0;
+		if(*s != '\'' && *s != '\"'){
+			werrstr("xml is complicated (%d)", *s);
+			goto error;
+		}
+		a->v = s+1;
+		s = utfrune(a->v, *s);
+		if(s == nil){
+			werrstr("xml is broken");
+			goto error;
+		}
+		*s++ = 0;
+		a->next = attrs;
+		a->n = unxml(a->n);
+		a->v = unxml(a->v);
+		attrs = a;
+		if((p = strchr(a->n, ':')) != nil && strncmp(p, ":zdef", 5) == 0)
+			*p = 0;
+	}
+	return attrs;
+error:
+	*err = 1;
+	free(a);
+	for(; attrs != nil; attrs = a){
+		a = attrs->next;
+		free(attrs);
+	}
+	return nil;
+}
+
+/*
+ * Read the next element from h, recursing into its children.  With Xmlvalue set
+ * and par non-nil, text found before the next tag is stored in par->v.  Returns
+ * nil at a closing tag, after storing text, and on error (errstr is set).
+ */
+static Xelem *
+xmlread_(Biobufhdr *h, Xelem *par, int flags)
+{
+	char *s, *t;
+	Xelem *x, *ch;
+	int r, closed, len, err;
+
+	x = nil;
+	for(;;){
+		r = Bgetrune(h);
+		if(r < 0){
+			werrstr("xmlread: %r");
+			goto error;
+		}
+		if(r == '<')
+			break;
+		if(isspacerune(r))
+			continue;
+		if(flags & Xmlvalue && par != nil){
+			Bungetrune(h);
+			if((s = Brdstr(h, '<', 1)) == nil){
+				werrstr("xmlread: %r");
+				goto error;
+			}
+			par->v = unxml(s);
+			if((s = Brdstr(h, '>', 1)) == nil){	/* consume the tag after the text */
+				free(par->v);
+				par->v = nil;
+				werrstr("xmlread: %r");
+			}
+			free(s);
+			return nil;
+		}
+		werrstr("xmlread: unexpected rune (%C)", r);
+		goto error;
+	}
+
+	s = Brdstr(h, '>', 1);
+	if(s == nil){
+		werrstr("xmlread: %r");
+		goto error;
+	}
+	if(s[0] == '/'){	/* closing tag */
+		free(s);
+		return nil;
+	}
+	if(s[0] == '?'){	/* skip <?...?> and read what follows it */
+		free(s);
+		return xmlread_(h, par, flags);
+	}
+	if((x = mallocz(sizeof(*x), 1)) == nil){
+		free(s);	/* not yet owned by an element */
+		werrstr("xmlread: %r");
+		goto error;
+	}
+	x->priv = s;	/* owns the tag text that n and the attributes point into */
+	x->n = s;
+	if(strncmp(x->n, "zdef", 4) == 0){
+		if((x->n = strchr(x->n, ':')) == nil){
+			werrstr("xmlread: zdef without ':'");
+			goto error;
+		}
+		x->n += 1;
+	}
+
+	len = strlen(s);
+	if(len > 0 && (s[len-1] == '/' || s[len-1] == '?')){	/* "<>" has no last byte */
+		closed = 1;
+		s[len-1] = 0;
+	}else
+		closed = flags & Xmlstartonly;
+	for(; *s && !isspace((uchar)*s); s++);	/* the name ends at any whitespace */
+	if(*s){
+		*s++ = 0;
+		for(; isspace((uchar)*s); s++);
+		x->a = xmlattr(s, &err);
+		if(err != 0)
+			goto error;
+	}
+
+	if(strcmp(x->n, "html") == 0){	/* keep everything up to "</html>" as the value */
+		for(len = 0;; len += r){
+			s = Brdstr(h, '>', 0);
+			if(s == nil){
+				werrstr("xmlread: %r");
+				goto error;
+			}
+			r = strlen(s);
+			t = realloc(x->v, len + r + 1);
+			if(t == nil){
+				free(s);	/* x->v is still valid; xmlfree releases it */
+				werrstr("xmlread: %r");
+				goto error;
+			}
+			x->v = t;
+			strcpy(x->v+len, s);
+			free(s);
+			t = strstr(x->v+len, "</html>");
+			if(t != nil){
+				*t = 0;
+				return x;
+			}
+		}
+	}
+
+	/* children are prepended, so x->ch lists them in reverse document order */
+	while(!closed && (ch = xmlread_(h, x, Xmlvalue)) != nil){
+		ch->next = x->ch;
+		x->ch = ch;
+	}
+	return x;
+error:
+	xmlfree(x);
+	return nil;
+}
+
+Xelem *
+xmlread(Biobuf *b, int flags)
+{
+	return xmlread_(b, nil, flags & Xmlstartonly);	/* no parent; only the Xmlstartonly bit is passed on */
+}
+
+/* Free x together with its value, attributes and whole subtree, plus every
+ * sibling that follows it in the next chain.  A nil x is ignored. */
+void
+xmlfree(Xelem *x)
+{
+	Xattr *at, *atnext;
+	Xelem *sib, *sibnext;
+
+	if(x == nil)
+		return;
+	xmlfree(x->ch);
+	x->ch = nil;
+	free(x->v);
+	x->v = nil;
+	free(x->priv);
+	for(at = x->a; at != nil; at = atnext){
+		atnext = at->next;
+		free(at);
+	}
+	/* unlink each sibling before freeing it so the recursion stops there */
+	for(sib = x->next; sib != nil; sib = sibnext){
+		sibnext = sib->next;
+		sib->next = nil;
+		xmlfree(sib);
+	}
+	free(x);
+}
+
+/* Follow a nil-terminated list of child names down from x; nil if x is nil or a name is missing. */
+Xelem *
+xmlget(Xelem *x, char *path, ...)
+{
+	va_list a;
+
+	va_start(a, path);	/* portable walk; does not assume the arguments are contiguous */
+	for(; x != nil && path != nil; path = va_arg(a, char*))
+		for(x = x->ch; x != nil && strcmp(x->n, path) != 0; x = x->next)
+			;
+	va_end(a);
+	return x;
+}
+
+/* Return the first attribute in list a whose name equals name, or nil if none does. */
+Xattr *
+xmlgetattr(Xattr *a, char *name)
+{
+	while(a != nil && strcmp(a->n, name) != 0)
+		a = a->next;
+	return a;
+}
+
+/* Print x and the siblings after it on fd, one per line; children go 4 columns deeper. */
+static void
+xmlprint_(Xelem *x, int fd, int off)
+{
+	Xattr *at;
+
+	while(x != nil){
+		fprint(fd, "%*c%q", off, ' ', x->n);
+		if(x->v != nil)
+			fprint(fd, "=%#q", x->v);
+		for(at = x->a; at != nil; at = at->next)
+			fprint(fd, " %q=%#q", at->n, at->v);
+		fprint(fd, "\n");
+		xmlprint_(x->ch, fd, off+4);
+		x = x->next;
+	}
+}
+
+void
+xmlprint(Xelem *x, int fd)
+{
+	xmlprint_(x, fd, 0);	/* dump x and the siblings after it to fd, starting at offset 0 */
+}
--- /dev/null
+++ b/xml.h
@@ -1,0 +1,30 @@
+typedef struct Xelem Xelem;	/* element node */
+typedef struct Xattr Xattr;	/* attribute of an element */
+
+struct Xelem
+{
+	char *n;	/* element name; points into priv */
+	char *v;	/* text value, separately allocated, or nil */
+	Xattr *a;	/* attribute list, or nil */
+	Xelem *ch;	/* first child; the parser links the newest child first */
+	Xelem *next;	/* next sibling */
+	void *priv;	/* allocated tag text, released by xmlfree */
+};
+
+struct Xattr
+{
+	char *n;	/* attribute name; not freed separately by xmlfree */
+	char *v;	/* attribute value; not freed separately by xmlfree */
+	Xattr *next;	/* next attribute in the list */
+};
+
+enum
+{
+	Xmlstartonly = 1,	/* xmlread flag: return the element without reading its children */
+};
+
+Xelem *xmlread(Biobuf *b, int flags);	/* parse the next element from b; nil on failure */
+void xmlfree(Xelem *x);	/* free x, its subtree and the siblings that follow it */
+Xelem *xmlget(Xelem *x, char *path, ...);	/* descend by child names; the name list ends with nil */
+Xattr *xmlgetattr(Xattr *a, char *n);	/* find the attribute named n in list a, or nil */
+void xmlprint(Xelem *x, int fd);	/* write an indented dump of x to fd */