ref: cb273846e4702b096feb761d26c3cff82f2f22df
dir: /js-lex.c/
#include "js.h"
#define nelem(a) (sizeof (a) / sizeof (a)[0])
static const char *keywords[] = {
"break", "case", "catch", "continue", "debugger", "default", "delete",
"do", "else", "false", "finally", "for", "function", "if", "in",
"instanceof", "new", "null", "return", "switch", "this", "throw",
"true", "try", "typeof", "var", "void", "while", "with",
};
static const char *futurewords[] = {
"class", "const", "enum", "export", "extends", "import", "super",
};
static const char *strictfuturewords[] = {
"implements", "interface", "let", "package", "private", "protected",
"public", "static", "yield",
};
static inline int findword(const char *s, const char **list, int num)
{
int l = 0;
int r = num - 1;
while (l <= r) {
int m = (l + r) >> 1;
int c = strcmp(s, list[m]);
if (c < 0)
r = m - 1;
else if (c > 0)
l = m + 1;
else
return m;
}
return -1;
}
static inline js_Token findkeyword(js_State *J, const char *s)
{
int i = findword(s, keywords, nelem(keywords));
if (i >= 0)
return JS_BREAK + i;
if (findword(s, futurewords, nelem(futurewords)) >= 0)
return js_syntaxerror(J, "'%s' is a future reserved word", s);
if (J->strict && findword(s, strictfuturewords, nelem(strictfuturewords)) >= 0)
return js_syntaxerror(J, "'%s' is a strict mode future reserved word", s);
return JS_IDENTIFIER;
}
const char *tokenstrings[] = {
"(error)", "(eof)", "(identifier)", "null", "true", "false",
"(number)", "(string)", "(regexp)", "(newline)",
"{", "}", "(", ")", "[", "]", ".", ";", ",",
"<", ">", "<=", ">=", "==", "!=", "===", "!==",
"+", "-", "*", "%", "++", "--", "<<", ">>", ">>>", "&", "|",
"^", "!", "~", "&&", "||", "?", ":",
"=", "+=", "-=", "*=", "%=", "<<=", ">>=", ">>>=", "&=", "|=", "^=",
"/", "/=",
"break", "case", "catch", "continue", "debugger", "default", "delete",
"do", "else", "finally", "for", "function", "if", "in", "instanceof",
"new", "return", "switch", "this", "throw", "try", "typeof", "var",
"void", "while", "with",
};
const char *js_tokentostring(js_Token t)
{
return tokenstrings[t];
}
#define GET() (*(*sp)++)
#define UNGET() ((*sp)--)
#define PEEK() (**sp)
#define NEXT() ((*sp)++)
#define NEXTPEEK() (NEXT(), PEEK())
#define LOOK(x) (PEEK() == x ? (NEXT(), 1) : 0)
js_Token js_syntaxerror(js_State *J, const char *fmt, ...)
{
va_list ap;
fprintf(stderr, "syntax error: line %d: ", J->yyline);
va_start(ap, fmt);
vfprintf(stderr, fmt, ap);
va_end(ap);
fprintf(stderr, "\n");
return JS_ERROR;
}
static void textinit(js_State *J)
{
if (!J->yytext) {
J->yycap = 4096;
J->yytext = malloc(J->yycap);
}
J->yylen = 0;
}
static inline void textpush(js_State *J, int c)
{
if (J->yylen >= J->yycap) {
J->yycap = J->yycap * 2;
J->yytext = realloc(J->yytext, J->yycap);
}
J->yytext[J->yylen++] = c;
}
static inline void textend(js_State *J)
{
textpush(J, 0);
}
static inline int iswhite(int c)
{
return c == 0x9 || c == 0xb || c == 0xc || c == 0x20 || c == 0xa0;
}
static inline int isnewline(c)
{
return c == 0xa || c == 0xd || c == 0x2028 || c == 0x2029;
}
static inline int isidentifierstart(int c)
{
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '$' || c == '_';
}
static inline int isidentifierpart(int c)
{
return (c >= '0' && c <= '9') || isidentifierstart(c);
}
static inline int isdec(int c)
{
return (c >= '0' && c <= '9');
}
static inline int ishex(int c)
{
return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
}
static inline int tohex(int c)
{
if (c >= '0' && c <= '9')
return c - '0';
if (c >= 'a' && c <= 'f')
return c - 'a' + 0xa;
if (c >= 'A' && c <= 'F')
return c - 'A' + 0xa;
return 0;
}
static inline void lexlinecomment(const char **sp)
{
int c = PEEK();
while (c && !isnewline(c)) {
c = NEXTPEEK();
}
}
static inline int lexcomment(const char **sp)
{
while (1) {
int c = GET();
if (c == '*') {
while (c == '*')
c = GET();
if (c == '/')
return 0;
} else if (c == 0) {
return -1;
}
}
}
static inline double lexhex(const char **sp)
{
double n = 0;
int c = PEEK();
while (ishex(c)) {
n = n * 16 + tohex(c);
c = NEXTPEEK();
}
return n;
}
static inline double lexinteger(const char **sp)
{
double n = 0;
int c = PEEK();
while (isdec(c)) {
n = n * 10 + (c - '0');
c = NEXTPEEK();
}
return n;
}
static inline double lexfraction(const char **sp)
{
double n = 0;
double d = 1;
int c = PEEK();
while (isdec(c)) {
n = n * 10 + (c - '0');
d = d * 10;
c = NEXTPEEK();
}
return n / d;
}
static inline double lexexponent(const char **sp)
{
if (LOOK('e') || LOOK('E')) {
if (LOOK('-'))
return -lexinteger(sp);
else if (LOOK('+'))
return lexinteger(sp);
else
return lexinteger(sp);
}
return 0;
}
static inline js_Token lexnumber(js_State *J, const char **sp)
{
double n;
if ((*sp)[0] == '0' && ((*sp)[1] == 'x' || (*sp)[1] == 'X')) {
*sp += 2;
if (!ishex(PEEK()))
return js_syntaxerror(J, "0x not followed by hexademical digit");
J->yynumber = lexhex(sp);
return JS_NUMBER;
}
if ((*sp)[0] == '0' && isdec((*sp)[1]))
return js_syntaxerror(J, "number with leading zero");
n = lexinteger(sp);
if (LOOK('.'))
n += lexfraction(sp);
n *= pow(10, lexexponent(sp));
if (isidentifierstart(PEEK()))
return js_syntaxerror(J, "number with letter suffix");
J->yynumber = n;
return JS_NUMBER;
}
static inline int lexescape(const char **sp)
{
int c = GET();
int x = 0;
switch (c) {
case '0': return 0;
case 'u':
if (!ishex(PEEK())) return x; else x |= NEXTPEEK() << 12;
if (!ishex(PEEK())) return x; else x |= NEXTPEEK() << 8;
if (!ishex(PEEK())) return x; else x |= NEXTPEEK() << 4;
if (!ishex(PEEK())) return x; else x |= NEXTPEEK();
return x;
case 'x':
if (!ishex(PEEK())) return x; else x |= NEXTPEEK() << 4;
if (!ishex(PEEK())) return x; else x |= NEXTPEEK();
return x;
case '\'': return '\'';
case '"': return '"';
case '\\': return '\\';
case 'b': return '\b';
case 'f': return '\f';
case 'n': return '\n';
case 'r': return '\r';
case 't': return '\t';
case 'v': return '\v';
default: return c;
}
}
static inline js_Token lexstring(js_State *J, const char **sp, int q)
{
int c = GET();
textinit(J);
while (c != q) {
if (c == 0 || isnewline(c))
return js_syntaxerror(J, "string not terminated");
if (c == '\\')
c = lexescape(sp);
textpush(J, c);
c = GET();
}
textend(J);
return JS_STRING;
}
/* the ugliest language wart ever... */
static int isregexpcontext(js_Token last)
{
switch (last)
{
case JS_IDENTIFIER:
case JS_NULL:
case JS_TRUE:
case JS_FALSE:
case JS_THIS:
case JS_NUMBER:
case JS_STRING:
case JS_RSQUARE:
case JS_RPAREN:
return 0;
default:
return 1;
}
}
static js_Token lexregexp(js_State *J, const char **sp)
{
int c;
textinit(J);
/* regexp body */
c = GET();
while (c != '/') {
if (c == 0 || isnewline(c)) {
return js_syntaxerror(J, "regular expression not terminated");
} else if (c == '\\') {
textpush(J, c);
c = GET();
if (c == 0 || isnewline(c))
return js_syntaxerror(J, "regular expression not terminated");
textpush(J, c);
c = GET();
} else {
textpush(J, c);
c = GET();
}
}
textend(J);
/* regexp flags */
J->yyflags.g = J->yyflags.i = J->yyflags.m = 0;
c = PEEK();
while (isidentifierpart(c)) {
if (c == 'g') J->yyflags.g ++;
else if (c == 'i') J->yyflags.i ++;
else if (c == 'm') J->yyflags.m ++;
else return js_syntaxerror(J, "illegal flag in regular expression: %c", c);
c = NEXTPEEK();
}
if (J->yyflags.g > 1 || J->yyflags.i > 1 || J->yyflags.m > 1)
return js_syntaxerror(J, "duplicated flag in regular expression");
return JS_REGEXP;
}
static js_Token js_leximp(js_State *J, const char **sp)
{
int c = GET();
while (c) {
while (iswhite(c))
c = GET();
if (isnewline(c)) {
/* consume CR LF as one unit */
if (c == '\r' && PEEK() == '\n')
NEXT();
J->yyline++;
return JS_NEWLINE;
}
if (c == '/') {
if (LOOK('/')) {
lexlinecomment(sp);
} else if (LOOK('*')) {
if (lexcomment(sp))
return js_syntaxerror(J, "multi-line comment not terminated");
} else if (isregexpcontext(J->lasttoken)) {
return lexregexp(J, sp);
} else if (LOOK('=')) {
return JS_SLASH_EQ;
} else {
return JS_SLASH;
}
}
if (isidentifierstart(c)) {
textinit(J);
textpush(J, c);
c = PEEK();
while (isidentifierpart(c)) {
textpush(J, c);
c = NEXTPEEK();
}
textend(J);
return findkeyword(J, J->yytext);
}
if (c == '.') {
if (isdec(PEEK())) {
UNGET();
return lexnumber(J, sp);
}
return JS_PERIOD;
}
if (c >= '0' && c <= '9') {
UNGET();
return lexnumber(J, sp);
}
if (c == '\'' || c == '"')
return lexstring(J, sp, c);
switch (c) {
case '{': return JS_LCURLY;
case '}': return JS_RCURLY;
case '(': return JS_LPAREN;
case ')': return JS_RPAREN;
case '[': return JS_LSQUARE;
case ']': return JS_RSQUARE;
case '.': return JS_PERIOD;
case ';': return JS_SEMICOLON;
case ',': return JS_COMMA;
case '<':
if (LOOK('<')) {
if (LOOK('='))
return JS_LT_LT_EQ;
return JS_LT_LT;
}
if (LOOK('='))
return JS_LT_EQ;
return JS_LT;
case '>':
if (LOOK('>')) {
if (LOOK('>')) {
if (LOOK('='))
return JS_GT_GT_GT_EQ;
return JS_GT_GT_GT;
}
if (LOOK('='))
return JS_GT_GT_EQ;
return JS_GT_GT;
}
if (LOOK('='))
return JS_GT_EQ;
return JS_GT;
case '=':
if (LOOK('=')) {
if (LOOK('='))
return JS_EQ_EQ_EQ;
return JS_EQ_EQ;
}
return JS_EQ;
case '!':
if (LOOK('=')) {
if (LOOK('='))
return JS_EXCL_EQ_EQ;
return JS_EXCL_EQ;
}
return JS_EXCL;
case '+':
if (LOOK('+'))
return JS_PLUS_PLUS;
if (LOOK('='))
return JS_PLUS_EQ;
return JS_PLUS;
case '-':
if (LOOK('-'))
return JS_MINUS_MINUS;
if (LOOK('='))
return JS_MINUS_EQ;
return JS_MINUS;
case '*':
if (LOOK('='))
return JS_STAR_EQ;
return JS_STAR;
case '%':
if (LOOK('='))
return JS_PERCENT_EQ;
return JS_PERCENT;
case '&':
if (LOOK('&'))
return JS_AND_AND;
if (LOOK('='))
return JS_AND_EQ;
return JS_AND;
case '|':
if (LOOK('|'))
return JS_BAR_BAR;
if (LOOK('='))
return JS_BAR_EQ;
return JS_BAR;
case '^':
if (LOOK('='))
return JS_HAT_EQ;
return JS_HAT;
case '~': return JS_TILDE;
case '?': return JS_QUESTION;
case ':': return JS_COLON;
}
c = GET();
}
return JS_EOF;
}
js_Token js_lex(js_State *J, const char **sp)
{
js_Token t = js_leximp(J, sp);
J->lasttoken = t;
return t;
}
void js_initlex(js_State *J)
{
J->yyline = 1;
J->lasttoken = JS_ERROR;
}