shithub: rgbds

--- a/include/asm/asm.h

+++ b/include/asm/asm.h

@@ -27,9 +27,5 @@

 extern uint32_t nTotalLines;

 extern uint32_t nIFDepth;

 extern struct Section *pCurrentSection;

-extern bool oDontExpandStrings;

-size_t symvaluetostring(char *dest, size_t maxLength, char *sym,

-			const char *mode);

 #endif /* RGBDS_ASM_ASM_H */

--- a/include/asm/fstack.h

+++ b/include/asm/fstack.h

@@ -32,7 +32,7 @@

 	uint32_t uniqueID;

 	int32_t nLine;

 	uint32_t nStatus;

-	char *pREPTBlock;

+	char const *pREPTBlock;

 	uint32_t nREPTBlockCount;

 	uint32_t nREPTBlockSize;

 	int32_t nREPTBodyFirstLine;

@@ -47,7 +47,7 @@

 void fstk_DumpToStr(char *buf, size_t len);

 void fstk_AddIncludePath(char *s);

 void fstk_RunMacro(char *s, struct MacroArgs *args);

-void fstk_RunRept(uint32_t count, int32_t nReptLineNo);

+void fstk_RunRept(uint32_t count, int32_t nReptLineNo, char const *body, size_t size);

/**

  * @param path The user-provided file name

  * @param fullPath The address of a pointer, which will be made to point at the full path

--- a/include/asm/lexer.h

+++ b/include/asm/lexer.h

@@ -33,10 +33,13 @@

 struct LexerState *lexer_OpenFile(char const *path);

 struct LexerState *lexer_OpenFileView(void);

 void lexer_DeleteState(struct LexerState *state);

+void lexer_Init(void);

 enum LexerMode {

 	LEXER_NORMAL,

-	LEXER_RAW

+	LEXER_RAW,

+	LEXER_SKIP_TO_ELIF,

+	LEXER_SKIP_TO_ENDC

};

 void lexer_SetMode(enum LexerMode mode);

@@ -47,7 +50,7 @@

 uint32_t lexer_GetColNo(void);

 void lexer_DumpStringExpansions(void);

 int yylex(void);

-void lexer_SkipToBlockEnd(int blockStartToken, int blockEndToken, int endToken,

-			  char const **capture, size_t *size, char const *name);

+void lexer_CaptureBlock(int blockStartToken, int blockEndToken, char const **capture, size_t *size,

+			char const *name);

 #endif /* RGBDS_ASM_LEXER_H */

--- a/include/asm/symbol.h

+++ b/include/asm/symbol.h

@@ -44,8 +44,8 @@

 			int32_t (*callback)(void);

};

 		struct { /* For SYM_MACRO */

-			uint32_t macroSize;

-			char *macro;

+			size_t macroSize;

+			char const *macro;

};

};

@@ -114,9 +114,10 @@

 struct Symbol *sym_AddEqu(char const *symName, int32_t value);

 struct Symbol *sym_AddSet(char const *symName, int32_t value);

 uint32_t sym_GetPCValue(void);

+uint32_t sym_GetConstantSymValue(struct Symbol const *sym);

 uint32_t sym_GetConstantValue(char const *s);

 struct Symbol *sym_FindSymbol(char const *symName);

-struct Symbol *sym_AddMacro(char const *symName, int32_t defLineNo);

+struct Symbol *sym_AddMacro(char const *symName, int32_t defLineNo, char const *body, size_t size);

 struct Symbol *sym_Ref(char const *symName);

 struct Symbol *sym_AddString(char const *symName, char const *value);

 uint32_t sym_GetDefinedValue(char const *s);

--- a/include/asm/util.h

+++ b/include/asm/util.h

@@ -12,6 +12,7 @@

 #include <stdint.h>

 uint32_t calchash(const char *s);

+char const *print(char c);

 size_t readUTF8Char(uint8_t *dest, char const *src);

 #endif /* RGBDS_UTIL_H */

--- a/src/asm/asmy.y

+++ b/src/asm/asmy.y

@@ -39,64 +39,8 @@

 char *tzNewMacro;

 uint32_t ulNewMacroSize;

 int32_t nPCOffset;

-bool skipElifs; /* If this is set, ELIFs cannot be executed anymore */

+bool executedIfBlock; /* If this is set, ELIFs cannot be executed anymore */

-size_t symvaluetostring(char *dest, size_t maxLength, char *symName,

-			const char *mode)

-{

-	size_t length;

-	struct Symbol *sym = sym_FindSymbol(symName);

-	if (sym && sym->type == SYM_EQUS) {

-		char const *src = sym_GetStringValue(sym);

-		size_t i;

-		if (mode)

-			error("Print types are only allowed for numbers\n");

-		for (i = 0; src[i] != 0; i++) {

-			if (i >= maxLength)

-				fatalerror("Symbol value too long to fit buffer\n");

-			dest[i] = src[i];

-		}

-		length = i;

-	} else {

-		uint32_t value = sym_GetConstantValue(symName);

-		int32_t fullLength;

-		/* Special cheat for binary */

-		if (mode && !mode[0]) {

-			char binary[33]; /* 32 bits + 1 terminator */

-			char *write_ptr = binary + 32;

-			fullLength = 0;

-			binary[32] = 0;

-			do {

-				*(--write_ptr) = (value & 1) + '0';

-				value >>= 1;

-				fullLength++;

-			} while(value);

-			strncpy(dest, write_ptr, maxLength + 1);

-		} else {

-			fullLength = snprintf(dest, maxLength + 1,

-							  mode ? mode : "$%" PRIX32,

-						      value);

-		}

-		if (fullLength < 0) {

-			fatalerror("snprintf encoding error\n");

-		} else {

-			length = (size_t)fullLength;

-			if (length > maxLength)

-				fatalerror("Symbol value too long to fit buffer\n");

-		}

-	}

-	return length;

-}

 static uint32_t str2int2(uint8_t *s, int32_t length)

 	int32_t i;

@@ -388,18 +332,71 @@

 		| lines {

 			nListCountEmpty = 0;

 			nPCOffset = 0;

-		} line '\n' {

+		} line {

 			nTotalLines++;

-line		: label

-		| label cpu_command

-		| label macro

-		| label simple_pseudoop

-		| pseudoop

+line		: label '\n'

+		| label cpu_command '\n'

+		| label macro '\n'

+		| label simple_pseudoop '\n'

+		| pseudoop '\n'

+		| conditional /* May not necessarily be followed by a newline, see below */

+/*

+ * For "logistical" reasons, conditionals must manage newlines themselves.

+ * This is because we need to switch the lexer's mode *after* the newline has been read,

+ * and to avoid causing some grammar conflicts (token reducing is finicky).

+ * Concretely: each of `if`, `elif`, `else` and `endc` matches its own trailing '\n', and only

+ * calls `lexer_SetMode` from the action that runs once that newline has been matched.

+ * This is DEFINITELY one of the more FRAGILE parts of the codebase, handle with care.

+ */

+conditional	: if

+		/*

+		 * It's important that all of these require being at line start for `skipIfBlock`

+		 * NOTE(review): `skipIfBlock` is not defined in this part of the patch; presumably it

+		 * is the lexer routine behind the LEXER_SKIP_TO_* modes. Confirm the name is current.

+		 */

+		| elif

+		| else

+		| endc

+;

+/* IF opens a new conditional construct, and records whether its first block gets executed */
+if		: T_POP_IF const '\n' {
+			nIFDepth++;
+			if ($2) {
+				/* Condition holds: keep lexing normally, later ELIFs / ELSE must not run */
+				executedIfBlock = true;
+			} else {
+				/* Condition does not hold: let the lexer skip this block */
+				executedIfBlock = false;
+				lexer_SetMode(LEXER_SKIP_TO_ELIF);
+			}
+		}
+;

+/* ELIF only runs its block if no earlier block of the construct ran and its condition holds */
+elif		: T_POP_ELIF const '\n' {
+			if (nIFDepth <= 0)
+				fatalerror("Found ELIF outside an IF construct\n");
+
+			if (executedIfBlock) {
+				/* A previous block was executed, nothing else in this construct may be */
+				lexer_SetMode(LEXER_SKIP_TO_ENDC);
+			} else if ($2) {
+				/* First condition to hold: execute this block */
+				executedIfBlock = true;
+			} else {
+				/* Still no luck (`executedIfBlock` stays false): skip this block as well */
+				lexer_SetMode(LEXER_SKIP_TO_ELIF);
+			}
+		}
+;

+/* ELSE runs its block only if none of the construct's previous blocks was executed */
+else		: T_POP_ELSE '\n' {
+			/* `nIFDepth` is unsigned, so "not inside an IF" means it is exactly 0 */
+			if (nIFDepth == 0)
+				fatalerror("Found ELSE outside an IF construct\n");
+
+			if (executedIfBlock) {
+				/* Some block already ran, so ignore everything up to the ENDC */
+				lexer_SetMode(LEXER_SKIP_TO_ENDC);
+			}
+		}
+;

+/* ENDC closes the innermost conditional construct */
+endc		: T_POP_ENDC '\n' {
+			if (nIFDepth <= 0)
+				fatalerror("Found ENDC outside an IF construct\n");
+
+			nIFDepth--;
+			/*
+			 * `executedIfBlock` is shared by all nesting levels, so it still describes the
+			 * construct that just ended. Since the parser got to see this ENDC, the block
+			 * of the enclosing construct that contains it is being executed: restore the
+			 * flag accordingly. Otherwise, `IF 1 / IF 0 / ENDC / ELSE` would wrongly run
+			 * the outer ELSE block as well.
+			 */
+			executedIfBlock = true;
+		}
+;

 scoped_id	: T_ID | T_LOCAL_ID ;

 label		: /* empty */

@@ -460,10 +457,6 @@

 		| printt

 		| printv

 		| printi

-		| if

-		| elif

-		| else

-		| endc

 		| export

 		| db

 		| dw

@@ -606,9 +599,9 @@

 			uint32_t nDefinitionLineNo = lexer_GetLineNo();

 			char const *body;

 			size_t size;

-			lexer_SkipToBlockEnd(T_POP_REPT, T_POP_ENDR, T_POP_ENDR,

-					     &body, &size, "REPT block");

-			fstk_RunRept($2, nDefinitionLineNo);

+			lexer_CaptureBlock(T_POP_REPT, T_POP_ENDR, &body, &size,

+					   "REPT block");

+			fstk_RunRept($2, nDefinitionLineNo, body, size);

@@ -616,9 +609,9 @@

 			int32_t nDefinitionLineNo = lexer_GetLineNo();

 			char const *body;

 			size_t size;

-			lexer_SkipToBlockEnd(T_POP_MACRO, T_POP_ENDM, T_POP_ENDM,

-					     &body, &size, "macro definition");

-			sym_AddMacro($1, nDefinitionLineNo);

+			lexer_CaptureBlock(T_POP_MACRO, T_POP_ENDM, &body, &size,

+					   "macro definition");

+			sym_AddMacro($1, nDefinitionLineNo, body, size);

@@ -784,72 +777,6 @@

 printf		: T_POP_PRINTF const	{ math_Print($2); }

-;

-if		: T_POP_IF const {

-			nIFDepth++;

-			if (!$2) {

-				/* The function is hardcoded to also stop on T_POP_ELSE and ENDC */

-				lexer_SkipToBlockEnd(T_POP_IF, T_POP_ENDC, T_POP_ELIF,

-						     NULL, NULL, "if block");

-				skipElifs = false;

-			} else {

-				skipElifs = true;

-			}

-		}

-;

-elif		: T_POP_ELIF const {

-			if (nIFDepth <= 0)

-				fatalerror("Found ELIF outside an IF construct\n");

-			if (skipElifs) {

-				/*

-				 * Executed when ELIF is reached at the end of

-				 * an IF or ELIF block for which the condition

-				 * was true.

-				 *

-				 * Continue parsing at ENDC keyword

-				 */

-				lexer_SkipToBlockEnd(T_POP_IF, T_POP_ENDC, T_POP_ENDC,

-						     NULL, NULL, "elif block");

-			} else {

-				/*

-				 * Executed when ELIF is skipped to because the

-				 * condition of the previous IF or ELIF block

-				 * was false.

-				 */

-				if (!$2) {

-					/*

-					 * Continue parsing after ELSE, or at

-					 * ELIF or ENDC keyword.

-					 */

-					lexer_SkipToBlockEnd(T_POP_IF, T_POP_ENDC, T_POP_ELIF,

-							     NULL, NULL, "elif block");

-				} else {

-					skipElifs = true;

-				}

-			}

-		}

-;

-else		: T_POP_ELSE {

-			if (nIFDepth <= 0)

-				fatalerror("Found ELSE outside an IF construct\n");

-			/* Continue parsing at ENDC keyword */

-			lexer_SkipToBlockEnd(T_POP_IF, T_POP_ENDC, T_POP_ENDC,

-					     NULL, NULL, "else block");

-		}

-;

-endc		: T_POP_ENDC {

-			if (nIFDepth <= 0)

-				fatalerror("Found ENDC outside an IF construct\n");

-			nIFDepth--;

-		}

 const_3bit	: const {

--- a/src/asm/fstack.c

+++ b/src/asm/fstack.c

@@ -41,7 +41,7 @@

 static int32_t NextIncPath;

 static uint32_t nMacroCount;

-static char *pCurrentREPTBlock;

+static char const *pCurrentREPTBlock;

 static uint32_t nCurrentREPTBlockSize;

 static uint32_t nCurrentREPTBlockCount;

 static int32_t nCurrentREPTBodyFirstLine;

@@ -249,9 +249,11 @@

 			pLastFile->nLine);

 		pLastFile = pLastFile->next;

+	char const *fileName = lexer_GetFileName();

-	fprintf(stderr, "%s(%" PRId32 ",%" PRId32 ")",

-		lexer_GetFileName(), lexer_GetLineNo(), lexer_GetColNo());

+	if (fileName)

+		fprintf(stderr, "%s(%" PRId32 ",%" PRId32 "): ",

+			fileName, lexer_GetLineNo(), lexer_GetColNo());

 void fstk_DumpToStr(char *buf, size_t buflen)

@@ -425,7 +427,7 @@

/*

  * Set up a repeat block for parsing

*/

-void fstk_RunRept(uint32_t count, int32_t nReptLineNo)

+void fstk_RunRept(uint32_t count, int32_t nReptLineNo, char const *body, size_t size)

 	if (count) {

 		pushcontext();

@@ -432,8 +434,8 @@

 		macro_SetUniqueID(nMacroCount++);

 		nCurrentREPTBlockCount = count;

 		nCurrentStatus = STAT_isREPTBlock;

-		nCurrentREPTBlockSize = ulNewMacroSize;

-		pCurrentREPTBlock = tzNewMacro;

+		nCurrentREPTBlockSize = size;

+		pCurrentREPTBlock = body;

 		nCurrentREPTBodyFirstLine = nReptLineNo + 1;

--- a/src/asm/lexer.c

+++ b/src/asm/lexer.c

@@ -9,8 +9,10 @@

 #include <sys/mman.h>

 #include <sys/stat.h>

 #include <assert.h>

+#include <ctype.h>

 #include <errno.h>

 #include <fcntl.h>

+#include <inttypes.h>

 #include <limits.h>

 #include <stdbool.h>

 #include <stdint.h>

@@ -19,13 +21,208 @@

 #include <string.h>

 #include <unistd.h>

+#include "extern/utf8decoder.h"

+#include "asm/asm.h"

 #include "asm/lexer.h"

+#include "asm/macro.h"

+#include "asm/main.h"

 #include "asm/rpn.h"

-#include "asm/symbol.h" /* For MAXSYMLEN in asmy.h */

+#include "asm/symbol.h"

+#include "asm/util.h"

 #include "asm/warning.h"

 /* Include this last so it gets all type & constant definitions */

 #include "asmy.h" /* For token definitions, generated from asmy.y */

+/*

+ * Identifiers that are also keywords are listed here. This ONLY applies to ones

+ * that would normally be matched as identifiers! Check out `yylex_NORMAL` to

+ * see how this is used.

+ * Tokens / keywords not handled here are handled in `yylex_NORMAL`'s switch.

+ * Each entry maps a keyword's spelling to the token that the lexer returns for it;

+ * `lexer_Init` builds `keywordDict` out of this array.

+ */

+static struct KeywordMapping {

+	char const *name;

+	int token;

+} const keywords[] = {

+	/*

+	 * CAUTION when editing this: adding keywords will probably require extra nodes in the

+	 * `keywordDict` array. If you forget to add them, you will probably trip the assertion

+	 * in `lexer_Init` anyway.

+	 * Also, all entries in this array must be in uppercase for the dict to build correctly.

+	 */

+	{"ADC", T_Z80_ADC},

+	{"ADD", T_Z80_ADD},

+	{"AND", T_Z80_AND},

+	{"BIT", T_Z80_BIT},

+	{"CALL", T_Z80_CALL},

+	{"CCF", T_Z80_CCF},

+	{"CPL", T_Z80_CPL},

+	{"CP", T_Z80_CP},

+	{"DAA", T_Z80_DAA},

+	{"DEC", T_Z80_DEC},

+	{"DI", T_Z80_DI},

+	{"EI", T_Z80_EI},

+	{"HALT", T_Z80_HALT},

+	{"INC", T_Z80_INC},

+	{"JP", T_Z80_JP},

+	{"JR", T_Z80_JR},

+	{"LD", T_Z80_LD},

+	{"LDI", T_Z80_LDI},

+	{"LDD", T_Z80_LDD},

+	{"LDIO", T_Z80_LDIO},

+	{"LDH", T_Z80_LDIO}, /* Alias: yields the same token as LDIO */

+	{"NOP", T_Z80_NOP},

+	{"OR", T_Z80_OR},

+	{"POP", T_Z80_POP},

+	{"PUSH", T_Z80_PUSH},

+	{"RES", T_Z80_RES},

+	{"RETI", T_Z80_RETI},

+	{"RET", T_Z80_RET},

+	{"RLCA", T_Z80_RLCA},

+	{"RLC", T_Z80_RLC},

+	{"RLA", T_Z80_RLA},

+	{"RL", T_Z80_RL},

+	{"RRC", T_Z80_RRC},

+	{"RRCA", T_Z80_RRCA},

+	{"RRA", T_Z80_RRA},

+	{"RR", T_Z80_RR},

+	{"RST", T_Z80_RST},

+	{"SBC", T_Z80_SBC},

+	{"SCF", T_Z80_SCF},

+	{"SET", T_POP_SET},

+	{"SLA", T_Z80_SLA},

+	{"SRA", T_Z80_SRA},

+	{"SRL", T_Z80_SRL},

+	{"STOP", T_Z80_STOP},

+	{"SUB", T_Z80_SUB},

+	{"SWAP", T_Z80_SWAP},

+	{"XOR", T_Z80_XOR},

+	{"NZ", T_CC_NZ},

+	{"Z", T_CC_Z},

+	{"NC", T_CC_NC},

+	/* Handled in list of registers */

+	/* { "C", T_CC_C }, */

+	{"AF", T_MODE_AF},

+	{"BC", T_MODE_BC},

+	{"DE", T_MODE_DE},

+	{"HL", T_MODE_HL},

+	{"SP", T_MODE_SP},

+	{"A", T_TOKEN_A},

+	{"B", T_TOKEN_B},

+	{"C", T_TOKEN_C}, /* Listed once only: no separate T_CC_C entry, see above */

+	{"D", T_TOKEN_D},

+	{"E", T_TOKEN_E},

+	{"H", T_TOKEN_H},

+	{"L", T_TOKEN_L},

+	{"DEF", T_OP_DEF},

+	{"FRAGMENT", T_POP_FRAGMENT},

+	{"BANK", T_OP_BANK},

+	{"ALIGN", T_OP_ALIGN},

+	{"ROUND", T_OP_ROUND},

+	{"CEIL", T_OP_CEIL},

+	{"FLOOR", T_OP_FLOOR},

+	{"DIV", T_OP_FDIV},

+	{"MUL", T_OP_FMUL},

+	{"SIN", T_OP_SIN},

+	{"COS", T_OP_COS},

+	{"TAN", T_OP_TAN},

+	{"ASIN", T_OP_ASIN},

+	{"ACOS", T_OP_ACOS},

+	{"ATAN", T_OP_ATAN},

+	{"ATAN2", T_OP_ATAN2},

+	{"HIGH", T_OP_HIGH},

+	{"LOW", T_OP_LOW},

+	{"ISCONST", T_OP_ISCONST},

+	{"STRCMP", T_OP_STRCMP},

+	{"STRIN", T_OP_STRIN},

+	{"STRSUB", T_OP_STRSUB},

+	{"STRLEN", T_OP_STRLEN},

+	{"STRCAT", T_OP_STRCAT},

+	{"STRUPR", T_OP_STRUPR},

+	{"STRLWR", T_OP_STRLWR},

+	{"INCLUDE", T_POP_INCLUDE},

+	{"PRINTT", T_POP_PRINTT},

+	{"PRINTI", T_POP_PRINTI},

+	{"PRINTV", T_POP_PRINTV},

+	{"PRINTF", T_POP_PRINTF},

+	{"EXPORT", T_POP_EXPORT},

+	{"XDEF", T_POP_XDEF},

+	{"GLOBAL", T_POP_GLOBAL},

+	{"DS", T_POP_DS},

+	{"DB", T_POP_DB},

+	{"DW", T_POP_DW},

+	{"DL", T_POP_DL},

+	{"SECTION", T_POP_SECTION},

+	{"PURGE", T_POP_PURGE},

+	{"RSRESET", T_POP_RSRESET},

+	{"RSSET", T_POP_RSSET},

+	{"INCBIN", T_POP_INCBIN},

+	{"CHARMAP", T_POP_CHARMAP},

+	{"NEWCHARMAP", T_POP_NEWCHARMAP},

+	{"SETCHARMAP", T_POP_SETCHARMAP},

+	{"PUSHC", T_POP_PUSHC},

+	{"POPC", T_POP_POPC},

+	{"FAIL", T_POP_FAIL},

+	{"WARN", T_POP_WARN},

+	{"FATAL", T_POP_FATAL},

+	{"ASSERT", T_POP_ASSERT},

+	{"STATIC_ASSERT", T_POP_STATIC_ASSERT},

+	{"MACRO", T_POP_MACRO},

+	{"ENDM", T_POP_ENDM},

+	{"SHIFT", T_POP_SHIFT},

+	{"REPT", T_POP_REPT},

+	{"ENDR", T_POP_ENDR},

+	{"LOAD", T_POP_LOAD},

+	{"ENDL", T_POP_ENDL},

+	{"IF", T_POP_IF},

+	{"ELSE", T_POP_ELSE},

+	{"ELIF", T_POP_ELIF},

+	{"ENDC", T_POP_ENDC},

+	{"UNION", T_POP_UNION},

+	{"NEXTU", T_POP_NEXTU},

+	{"ENDU", T_POP_ENDU},

+	{"WRAM0", T_SECT_WRAM0},

+	{"VRAM", T_SECT_VRAM},

+	{"ROMX", T_SECT_ROMX},

+	{"ROM0", T_SECT_ROM0},

+	{"HRAM", T_SECT_HRAM},

+	{"WRAMX", T_SECT_WRAMX},

+	{"SRAM", T_SECT_SRAM},

+	{"OAM", T_SECT_OAM},

+	{"RB", T_POP_RB},

+	{"RW", T_POP_RW},

+	{"EQU", T_POP_EQU},

+	{"EQUS", T_POP_EQUS},

+	/* Already handled above, in the list of CPU instructions */

+	/* {"SET", T_POP_SET}, */

+	{"PUSHS", T_POP_PUSHS},

+	{"POPS", T_POP_POPS},

+	{"PUSHO", T_POP_PUSHO},

+	{"POPO", T_POP_POPO},

+	{"OPT", T_POP_OPT}

+};

 #define LEXER_BUF_SIZE 42 /* TODO: determine a sane value for this */

 /* This caps the size of buffer reads, and according to POSIX, passing more than SSIZE_MAX is UB */

 static_assert(LEXER_BUF_SIZE <= SSIZE_MAX);

@@ -60,6 +257,7 @@

 	bool atLineStart;

 	uint32_t lineNo;

 	uint32_t colNo;

+	int lastToken;

 	bool capturing; /* Whether the text being lexed should be captured */

 	size_t captureSize; /* Amount of text captured */

@@ -83,12 +281,17 @@

 	if (isStdin)

 		path = "<stdin>";

 	if (!state) {

-		error("Failed to open file \"%s\": %s\n", path, strerror(errno));

+		error("Failed to allocate memory for lexer state: %s\n", strerror(errno));

 		return NULL;

 	state->path = path;

 	state->fd = isStdin ? STDIN_FILENO : open(path, O_RDONLY);

+	if (state->fd == -1) {

+		error("Failed to open file \"%s\": %s\n", path, strerror(errno));

+		free(state);

+		return NULL;

+	}

 	state->isMmapped = false; /* By default, assume it won't be mmap()ed */

 	off_t size = lseek(state->fd, 0, SEEK_END);

@@ -121,10 +324,16 @@

 			state->isMmapped = true;

 			state->ptr = pa;

 			state->size = size;

+			if (verbose)

+				printf("File %s successfully mmap()ped\n", path);

 	if (!state->isMmapped) {

 		/* Sometimes mmap() fails or isn't available, so have a fallback */

+		if (verbose)

+			printf("File %s opened as regular, errno reports \"%s\"\n",

+			       path, strerror(errno));

 		lseek(state->fd, 0, SEEK_SET);

 		state->index = 0;

@@ -132,6 +341,7 @@

 	state->mode = LEXER_NORMAL;

 	state->atLineStart = true; /* yylex() will init colNo due to this */

 	state->lineNo = 0;

+	state->lastToken = 0;

 	state->capturing = false;

 	state->captureBuf = NULL;

@@ -156,6 +366,72 @@

 	free(state);

+/* One node of the keyword lookup tree; `keywordDict` is the pool of nodes, entry 0 its root */

+struct KeywordDictNode {

+	/*

+	 * The identifier charset is (currently) 44 characters big. By storing entries for the

+	 * entire printable ASCII charset, minus lower-case due to case-insensitivity,

+	 * we only waste (0x60 - 0x20) - 44 = 20 entries per node, which should be acceptable.

+	 * In turn, this allows greatly simplifying checking an index into this array,

+	 * which should help speed up the lexer.

+	 */

+	uint16_t children[0x60 - ' ']; /* Node ID per next char (see `dictIndex`), 0 if none */

+	struct KeywordMapping const *keyword; /* Keyword ending at this node, NULL if none */

+/* Since the keyword structure is invariant, the min number of nodes is known at compile time */

+} keywordDict[336] = {0}; /* Make sure to keep this correct when adding keywords! */

+/* Convert a char into the index of its slot within a dict node's `children` array */
+static inline uint8_t dictIndex(char c)
+{
+	/* Fold lowercase onto uppercase (roughly: anything above '`' is moved down by 0x20) */
+	char folded = c > 0x60 ? c - ('a' - 'A') : c;
+
+	/* The array starts at the first printable char, the space */
+	return folded - ' ';
+}

+/* Performs the lexer's one-time setup, which consists of building the keyword dictionary */
+void lexer_Init(void)
+{
+	/*
+	 * Build the dictionary of keywords. This could be done at compile time instead, however:
+	 *  - Doing so manually is a task nobody wants to undertake
+	 *  - It would be massively hard to read
+	 *  - Doing it within CC or CPP would be quite non-trivial
+	 *  - Doing it externally would require some extra work to use only POSIX tools
+	 *  - The startup overhead isn't much compared to the program's
+	 */
+	size_t const nbKeywords = sizeof(keywords) / sizeof(*keywords);
+	uint16_t usedNodes = 1; /* Node 0 is where every lookup starts, so it's always in use */
+
+	for (size_t i = 0; i < nbKeywords; i++) {
+		struct KeywordMapping const *keyword = &keywords[i];
+		uint16_t nodeID = 0;
+
+		/* Follow the keyword's chars down the dictionary, adding the nodes that are missing */
+		for (char const *ptr = keyword->name; *ptr != '\0'; ptr++) {
+			/* We should be able to assume all entries are well-formed */
+			uint16_t *child = &keywordDict[nodeID].children[*ptr - ' '];
+
+			if (!*child) {
+				/*
+				 * If this gets tripped up, set the size of keywordDict to
+				 * something high, compile with `-DPRINT_NODE_COUNT` (see below),
+				 * and set the size to that.
+				 */
+				assert(usedNodes < sizeof(keywordDict) / sizeof(*keywordDict));
+
+				/* No node for that char yet, so take the next free one from the pool */
+				*child = usedNodes++;
+			}
+			nodeID = *child;
+		}
+
+		/* This assumes that no two keywords have the same name */
+		keywordDict[nodeID].keyword = keyword;
+	}
+
+#ifdef PRINT_NODE_COUNT /* For the maintainer to check how many nodes are needed */
+	printf("Lexer keyword dictionary: %zu keywords in %u nodes (pool size %zu)\n",
+	       nbKeywords, usedNodes, sizeof(keywordDict) / sizeof(*keywordDict));
+#endif
+}

 void lexer_SetMode(enum LexerMode mode)

 	lexerState->mode = mode;

@@ -187,7 +463,16 @@

 		if (lexerState->offset + distance >= lexerState->size)

 			return EOF;

+		/*

+		 * Note: the following block is also duplicated for the non-mmap() path. This sucks.

+		 * However, due to subtle handling differences, I haven't found a clean way to

+		 * avoid that duplication. If you have any ideas, please discuss them in an issue or

+		 * pull request. Thank you!

+		 */

+		/* Do not perform expansions while capturing */

 		if (!lexerState->capturing) {

+			/* Scan the newly-inserted chars for any macro args */

 			bool escaped = false;

 			while (lexerState->nbChars < distance && !escaped) {

@@ -204,7 +489,7 @@

-		return lexerState->ptr[lexerState->offset + distance];

+		return (unsigned char)lexerState->ptr[lexerState->offset + distance];

 	if (lexerState->nbChars <= distance) {

@@ -240,7 +525,7 @@

 		/* Do not perform expansions when capturing */

 		if (!lexerState->capturing) {

-			/* Scan the newly-inserted chars for any expansions */

+			/* Scan the newly-inserted chars for any macro args */

 			bool escaped = false;

 			size_t index = (lexerState->index + lexerState->nbChars) % LEXER_BUF_SIZE;

@@ -276,7 +561,7 @@

 		if (lexerState->nbChars <= distance)

 			return EOF;

-	return lexerState->buf[(lexerState->index + distance) % LEXER_BUF_SIZE];

+	return (unsigned char)lexerState->buf[(lexerState->index + distance) % LEXER_BUF_SIZE];

 static void shiftChars(uint8_t distance)

@@ -320,7 +605,7 @@

 char const *lexer_GetFileName(void)

-	return lexerState->path;

+	return lexerState ? lexerState->path : NULL;

 uint32_t lexer_GetLineNo(void)

@@ -338,6 +623,457 @@

 	/* TODO */

+/* Skips the rest of the line, stopping right before its terminator (or at end of input) */
+static void discardComment(void)
+{
+	int c;
+
+	/* The CR / LF itself is deliberately left unread */
+	while (c = peek(0), c != EOF && c != '\r' && c != '\n')
+		shiftChars(1);
+}

+/* Functions to lex numbers of various radixes */

+/*
+ * Lexes the digits of an integer constant, and stores the resulting value in `yylval`.
+ * Only chars '0' through '0' + radix - 1 count as digits, so this works for radixes up to 10.
+ * A warning is emitted whenever accumulating a digit makes the value overflow 32 bits.
+ * @param radix The base that the digits are expressed in
+ * @param baseValue The value to begin accumulating from (presumably, what the caller has
+ *                  already lexed of the constant; confirm against the callers)
+ */
+static void readNumber(int radix, int32_t baseValue)
+{
+	uint32_t value = baseValue;
+
+	for (;;) {
+		int c = peek(0);
+
+		if (c < '0' || c > '0' + radix - 1)
+			break;
+
+		uint32_t digit = c - '0';
+
+		/*
+		 * `value * radix + digit` overflows if and only if this holds. Checking against
+		 * `UINT32_MAX / radix` alone would miss the digit's own contribution, and e.g.
+		 * let 4294967299 silently wrap around.
+		 */
+		if (value > (UINT32_MAX - digit) / (uint32_t)radix)
+			warning(WARNING_LARGE_CONSTANT, "Integer constant is too large\n");
+		value = value * radix + digit;
+		shiftChars(1);
+	}
+
+	yylval.nConstValue = value;
+}

+/*
+ * Lexes the fractional digits of a fixed-point constant, whose integer part is expected to
+ * already be in `yylval.nConstValue` (presumably stored there by `readNumber`; confirm against
+ * the caller). The result replaces it, as a 16.16 fixed-point value.
+ * Warnings are emitted if there are too many fractional digits (the extra ones are ignored),
+ * and if the integer part does not fit in 16 bits.
+ */
+static void readFractionalPart(void)
+{
+	uint32_t value = 0, divisor = 1;
+
+	for (;;) {
+		int c = peek(0);
+
+		if (c < '0' || c > '9')
+			break;
+		if (divisor > UINT32_MAX / 10) {
+			warning(WARNING_LARGE_CONSTANT,
+				"Precision of fixed-point constant is too large\n");
+			/* Discard any additional digits */
+			while (c = peek(0), c >= '0' && c <= '9')
+				shiftChars(1);
+			break;
+		}
+		/* Accumulate the digit and consume it; without this, the loop would never end */
+		value = value * 10 + (c - '0');
+		divisor *= 10;
+		shiftChars(1);
+	}
+
+	if (yylval.nConstValue > INT16_MAX || yylval.nConstValue < INT16_MIN)
+		warning(WARNING_LARGE_CONSTANT, "Magnitude of fixed-point constant is too large\n");
+
+	/* Cast to unsigned avoids UB if shifting discards bits */
+	yylval.nConstValue = (uint32_t)yylval.nConstValue << 16;
+	/*
+	 * `value * 65536` needs more than 32 bits as soon as there are 5 fractional digits, so
+	 * compute in 64 bits. Since `value < divisor`, the quotient always fits in 16 bits.
+	 */
+	uint16_t fractional = (uint64_t)value * 65536 / divisor;
+
+	yylval.nConstValue |= fractional * (yylval.nConstValue >= 0 ? 1 : -1);
+}

+/*
+ * Lexes the digits of a binary constant, and stores the resulting value in `yylval`.
+ * Like the other integer lexing functions, this warns when the value overflows 32 bits.
+ */
+static void readBinaryNumber(void)
+{
+	uint32_t value = 0;
+
+	for (;;) {
+		int c = peek(0);
+
+		/* TODO: handle `-b`'s dynamic chars */
+		if (c != '0' && c != '1')
+			break;
+
+		uint32_t bit = c - '0';
+
+		/* `value * 2 + bit` overflows if and only if this holds */
+		if (value > (UINT32_MAX - bit) / 2)
+			warning(WARNING_LARGE_CONSTANT, "Integer constant is too large\n");
+		value = value * 2 + bit;
+		shiftChars(1);
+	}
+
+	yylval.nConstValue = value;
+}

+/*
+ * Lexes the digits of a hexadecimal constant (letters are accepted in either case), and stores
+ * the resulting value in `yylval`. An error is reported if there are no digits at all.
+ */
+static void readHexNumber(void)
+{
+	uint32_t result = 0;
+	size_t nbDigits = 0;
+
+	for (;; nbDigits++) {
+		int c = peek(0);
+		int digit;
+
+		if (c >= '0' && c <= '9')
+			digit = c - '0';
+		else if (c >= 'A' && c <= 'F')
+			digit = c - 'A' + 10; /* Letters come right after the decimal digits */
+		else if (c >= 'a' && c <= 'f')
+			digit = c - 'a' + 10;
+		else
+			break; /* Not a hex digit, so the constant ends here */
+
+		if (result > UINT32_MAX / 16)
+			warning(WARNING_LARGE_CONSTANT, "Integer constant is too large\n");
+		result = result * 16 + digit;
+		shiftChars(1);
+	}
+
+	if (nbDigits == 0)
+		error("Invalid integer constant, no digits after '$'\n");
+
+	yylval.nConstValue = result;
+}

+/*
+ * Lexes a gfx constant, that is, a run of digits '0' to '3' which each stand for one pixel.
+ * Bit 0 of each pixel is shifted into `bp0` and bit 1 into `bp1`; the value that gets stored in
+ * `yylval` is `bp1 << 8 | bp0`. Only the first 8 pixels are taken into account: any additional
+ * ones are consumed but ignored, and cause a warning.
+ */
+static void readGfxConstant(void)
+{
+	uint32_t bp0 = 0, bp1 = 0;
+	uint8_t width = 0; /* How many pixels were read, capped at 9 (which means "too many") */
+
+	for (;;) {
+		int c = peek(0);
+
+		/* TODO: handle `-g`'s dynamic chars */
+		if (c < '0' || c > '3')
+			break;
+
+		uint8_t pixel = c - '0';
+
+		if (width < 8) {
+			bp0 = bp0 << 1 | (pixel & 1);
+			bp1 = bp1 << 1 | (pixel >> 1);
+		}
+		if (width < 9)
+			width++;
+		shiftChars(1);
+	}
+
+	if (width == 0)
+		error("Invalid gfx constant, no digits after '`'\n");
+	else if (width == 9) /* Exactly 8 pixels is fine, so only warn if there were more */
+		warning(WARNING_LARGE_CONSTANT,
+			"Gfx constant is too large, only 8 first pixels considered\n");
+
+	yylval.nConstValue = bp1 << 8 | bp0;
+}

+/*
+ * Reads the rest of an identifier, the first char of which has already been read.
+ * The (possibly truncated) name is written to `yylval.tzSym`.
+ * Returns the matching keyword's token if the whole identifier spells a keyword; otherwise,
+ * returns T_LOCAL_ID if the identifier contains a dot, and T_ID if it doesn't.
+ */
+static int readIdentifier(char firstChar)
+{
+	size_t const maxLength = sizeof(yylval.tzSym) - 1;
+	int tokenType = T_ID;
+	/* The keyword dictionary is walked along as chars are read */
+	uint16_t nodeID = keywordDict[0].children[dictIndex(firstChar)];
+	size_t length = 1;
+
+	if (firstChar == '.')
+		tokenType = T_LOCAL_ID;
+	yylval.tzSym[0] = firstChar;
+
+	for (;;) {
+		int c = peek(0);
+		bool isSymbolChar = (c >= '0' && c <= '9')
+				 || (c >= 'A' && c <= 'Z')
+				 || (c >= 'a' && c <= 'z')
+				 || c == '#' || c == '.' || c == '@' || c == '_';
+
+		/* The identifier ends at the first char outside of the symbol charset */
+		if (!isSymbolChar)
+			break;
+		shiftChars(1);
+
+		/* Chars that don't fit are dropped, but still counted to detect truncation */
+		if (length < maxLength)
+			yylval.tzSym[length] = c;
+
+		/* Any dot makes the identifier a local one */
+		if (c == '.')
+			tokenType = T_LOCAL_ID;
+
+		/* Node 0 means that keyword matching failed earlier, which is definitive */
+		if (nodeID)
+			nodeID = keywordDict[nodeID].children[dictIndex(c)];
+
+		length++;
+	}
+
+	if (length > maxLength) {
+		warning(WARNING_LONG_STR, "Symbol name too long, got truncated\n");
+		length = maxLength;
+	}
+	yylval.tzSym[length] = '\0'; /* Terminate the string */
+
+	struct KeywordMapping const *keyword = keywordDict[nodeID].keyword;
+
+	return keyword ? keyword->token : tokenType;
+}

+/* Functions to read strings */

+/* Formats that a number can be interpolated as; the letter selecting each is in its comment */

+enum PrintType {

+	TYPE_NONE,     /* No print type given; `intToString` then uses '$' + uppercase hex */

+	TYPE_DECIMAL,  /* d */

+	TYPE_UPPERHEX, /* X */

+	TYPE_LOWERHEX, /* x */

+	TYPE_BINARY,   /* b */

+};

+/*
+ * Formats a numeric symbol's value into a buffer, which always ends up NUL-terminated.
+ * If the formatted value doesn't fit, it gets truncated, and a warning is emitted.
+ * @param dest The buffer to write the string to
+ * @param bufSize The size of that buffer, terminator included (must not be 0)
+ * @param sym The symbol whose value, as returned by `sym_GetConstantSymValue`, is formatted
+ * @param type The format to use; TYPE_NONE produces '$' followed by uppercase hex digits
+ */
+static void intToString(char *dest, size_t bufSize, struct Symbol const *sym, enum PrintType type)
+{
+	uint32_t value = sym_GetConstantSymValue(sym);
+	int fullLength;
+
+	/* Special cheat for binary */
+	if (type == TYPE_BINARY) {
+		char binary[33]; /* 32 bits + 1 terminator */
+		char *write_ptr = binary + 32;
+
+		binary[32] = '\0';
+		/* Digits are generated from the least significant one, so write them backwards */
+		do {
+			*(--write_ptr) = (value & 1) + '0';
+			value >>= 1;
+		} while (value);
+		/* Unlike `strncpy`, this guarantees that `dest` gets terminated on truncation */
+		fullLength = snprintf(dest, bufSize, "%s", write_ptr);
+	} else {
+		static char const * const formats[] = {
+			[TYPE_NONE]     = "$%" PRIX32,
+			[TYPE_DECIMAL]  = "%" PRId32,
+			[TYPE_UPPERHEX] = "%" PRIX32,
+			[TYPE_LOWERHEX] = "%" PRIx32
+		};
+
+		fullLength = snprintf(dest, bufSize, formats[type], value);
+	}
+
+	if (fullLength < 0) {
+		/* Formatting failed: report that only, there is no length to check against */
+		error("snprintf encoding error: %s\n", strerror(errno));
+		dest[0] = '\0';
+	} else if ((size_t)fullLength >= bufSize) {
+		warning(WARNING_LONG_STR, "Interpolated symbol %s too long to fit buffer\n",
+			sym->name);
+	}
+}

+/*

+ * Reads a symbol interpolation, the opening brace of which has already been shifted, and

+ * returns the string it expands to. The braces contain a symbol's name, optionally preceded

+ * by a one-letter print type and a colon; nested interpolations are expanded into the name.

+ * The returned pointer is either the string symbol's value, or a static buffer which gets

+ * overwritten by the next numeric interpolation. NULL is returned if the symbol does not exist

+ * or cannot be interpolated, in which case an error has been reported.

+ */

+static char const *readInterpolation(void)

+{

+	char symName[MAXSYMLEN + 1];

+	size_t i = 0; /* How many chars of `symName` have been written */

+	enum PrintType type = TYPE_NONE;

+	for (;;) {

+		int c = peek(0);

+		if (c == '{') { /* Nested interpolation */

+			shiftChars(1);

+			char const *inner = readInterpolation();

+			/* Append the inner expansion to the name, as much of it as fits */

+			if (inner) {

+				while (*inner) {

+					if (i == sizeof(symName))

+						break;

+					symName[i++] = *inner++;

+				}

+			}

+		} else if (c == EOF || c == '\r' || c == '\n' || c == '"') {

+			/* The offending char is not shifted, it is left for the caller to process */

+			error("Unterminated interpolation\n");

+			break;

+		} else if (c == '}') {

+			shiftChars(1);

+			break;

+		} else if (c == ':' && type == TYPE_NONE) { /* Print type, only once */

+			/* What has been read so far was the print type, not the symbol's name */

+			if (i != 1) {

+				error("Print types are exactly 1 character long\n");

+			} else {

+				switch (symName[0]) {

+				case 'b':

+					type = TYPE_BINARY;

+					break;

+				case 'd':

+					type = TYPE_DECIMAL;

+					break;

+				case 'X':

+					type = TYPE_UPPERHEX;

+					break;

+				case 'x':

+					type = TYPE_LOWERHEX;

+					break;

+				default:

+					error("Invalid print type '%s'\n", print(symName[0]));

+				}

+			}

+			i = 0; /* Now that type has been set, restart at beginning of string */

+			shiftChars(1);

+		} else {

+			if (i < sizeof(symName)) /* Allow writing an extra char to flag overflow */

+				symName[i++] = c;

+			shiftChars(1);

+		}

+	}

+	/* A completely full buffer has no room left for the terminator: the name was too long */

+	if (i == sizeof(symName)) {

+		warning(WARNING_LONG_STR, "Symbol name too long\n");

+		i--;

+	}

+	symName[i] = '\0';

+	struct Symbol const *sym = sym_FindSymbol(symName);

+	if (!sym) {

+		error("Interpolated symbol \"%s\" does not exist\n", symName);

+	} else if (sym->type == SYM_EQUS) {

+		if (type != TYPE_NONE)

+			error("Print types are only allowed for numbers\n");

+		return sym_GetStringValue(sym);

+	} else if (sym_IsNumeric(sym)) {

+		static char buf[33]; /* Worst case of 32 digits + terminator */

+		intToString(buf, sizeof(buf), sym, type);

+		return buf;

+	} else {

+		error("Only numerical and string symbols can be interpolated\n");

+	}

+	return NULL;

+}

+/*

+ * Reads a string literal up to and including its closing quote, and stores its contents in

+ * `yylval.tzString`, with character escapes and symbol interpolations processed.

+ * NOTE(review): this never looks for an opening quote, so the caller presumably consumed it.

+ * A string that is too long gets truncated with a warning; a string that runs into the end of

+ * the line or of the input is reported as an error, but what was read of it is still stored.

+ */

+static void readString(void)

+{

+	size_t i = 0; /* How many chars of the string have been written */

+	for (;;) {

+		int c = peek(0);

+		switch (c) {

+		case '"':

+			shiftChars(1);

+			/* A completely full buffer has no room left for the terminator */

+			if (i == sizeof(yylval.tzString)) {

+				i--;

+				warning(WARNING_LONG_STR, "String constant too long\n");

+			}

+			yylval.tzString[i] = '\0';

+			return;

+		case '\r':

+		case '\n': /* Do not shift these! */

+		case EOF:

+			if (i == sizeof(yylval.tzString)) {

+				i--;

+				warning(WARNING_LONG_STR, "String constant too long\n");

+			}

+			yylval.tzString[i] = '\0';

+			error("Unterminated string\n");

+			return;

+		case '\\': /* Character escape */

+			c = peek(1);

+			/*

+			 * For valid escapes, the backslash is shifted here, and the escaped char by the

+			 * common code after the outer switch. For invalid ones, only the backslash gets

+			 * copied and shifted, and the char after it will be read like any other.

+			 */

+			switch (c) {

+			case '\\': /* Return that character unchanged */

+			case '"':

+			case '{':

+			case '}':

+				shiftChars(1);

+				break;

+			case 'n':

+				c = '\n';

+				shiftChars(1);

+				break;

+			case 'r':

+				c = '\r';

+				shiftChars(1);

+				break;

+			case 't':

+				c = '\t';

+				shiftChars(1);

+				break;

+			case EOF: /* Can't really print that one */

+				error("Illegal character escape at end of input\n");

+				c = '\\';

+				break;

+			default:

+				error("Illegal character escape '%s'\n", print(c));

+				c = '\\';

+				break;

+			}

+			break;

+		case '{': /* Symbol interpolation */

+			shiftChars(1);

+			char const *ptr = readInterpolation();

+			/* Append the expansion to the string, as much of it as fits */

+			if (ptr) {

+				while (*ptr) {

+					if (i == sizeof(yylval.tzString))

+						break;

+					yylval.tzString[i++] = *ptr++;

+				}

+			}

+			continue; /* Do not copy an additional character */

+		/* Regular characters will just get copied */

+		}

+		if (i < sizeof(yylval.tzString)) /* Copy one extra to flag overflow */

+			yylval.tzString[i++] = c;

+		shiftChars(1);

+	}

+}

+/*

+ * Function to report one character's worth of garbage bytes

+ * `firstByte` is the character's first byte; any following bytes are obtained through `peek`,

+ * and are only shifted if together they form a valid, printable UTF-8 character.

+ * Returns a printable representation of the character, in a static buffer that gets overwritten

+ * by the next call: the character between single quotes if it can be printed (falling back to

+ * its first byte alone if that is printable), otherwise the first byte's value in hexadecimal.

+ */

+static char const *reportGarbageChar(unsigned char firstByte)

+{

+	static char bytes[6 + 2 + 1]; /* Max size of a UTF-8 encoded code point, plus "''\0" */

+	/* First, attempt UTF-8 decoding */

+	uint32_t state = 0; /* UTF8_ACCEPT */

+	uint32_t codepoint;

+	uint8_t size = 0; /* Number of additional bytes to shift */

+	bytes[1] = firstByte; /* No need to init the rest of the array */

+	decode(&state, &codepoint, firstByte);

+	while (state != 0 && state != 1 /* UTF8_REJECT */) {

+		int c = peek(size++);

+		if (c == EOF)

+			break;

+		bytes[size + 1] = c; /* bytes[0] is kept for the opening quote, bytes[1] is firstByte */

+		decode(&state, &codepoint, c);

+	}

+	/* Code points that don't fit in a char can't be given to `isprint`, assume they print */

+	if (state == 0 && (codepoint > UCHAR_MAX || isprint((unsigned char)codepoint))) {

+		/* Character is valid, printable UTF-8! */

+		shiftChars(size);

+		bytes[0] = '\'';

+		bytes[size + 2] = '\'';

+		bytes[size + 3] = '\0';

+		return bytes;

+	}

+	/* The character isn't valid UTF-8, so we'll only print that first byte */

+	if (isprint(firstByte)) {

+		/* bytes[1] = firstByte; */

+		bytes[0] = '\'';

+		bytes[2] = '\'';

+		bytes[3] = '\0';

+		return bytes;

+	}

+	/* Well then, print its hex value */

+	static char const hexChars[16] = "0123456789ABCDEF"; /* Exactly 16 chars, no terminator */

+	bytes[0] = '0';

+	bytes[1] = 'x';

+	bytes[2] = hexChars[firstByte >> 4];

+	bytes[3] = hexChars[firstByte & 0x0f];

+	bytes[4] = '\0';

+	return bytes;

+}

+/* Lexer core */

 static int yylex_NORMAL(void)

 	for (;;) {

@@ -344,32 +1080,165 @@

 		int c = nextChar();

 		switch (c) {

-		case '\n':

-			if (lexerStateEOL) {

-				lexer_SetState(lexerStateEOL);

-				lexerStateEOL = NULL;

-			}

-			return '\n';

+		/* Ignore whitespace and comments */

-		/* Ignore whitespace */

+		case '*':

+			if (!lexerState->atLineStart)

+				return T_OP_MUL;

+			warning(WARNING_OBSOLETE,

+				"'*' is deprecated for comments, please use ';' instead\n");

+			/* fallthrough */

+		case ';':

+			discardComment();

+			/* fallthrough */

 		case ' ':

 		case '\t':

 			break;

-		/* Handle single-char tokens */

+		/* Handle unambiguous single-char tokens */

+		case '^':

+			return T_OP_XOR;

 		case '+':

 			return T_OP_ADD;

 		case '-':

 			return T_OP_SUB;

+		case '/':

+			return T_OP_DIV;

+		case '~':

+			return T_OP_NOT;

+		case '@':

+			return T_ID;

 		/* Handle accepted single chars */

 		case '[':

 		case ']':

 		case '(':

 		case ')':

 		case ',':

+		case ':':

 			return c;

+		/* Handle ambiguous 1- or 2-char tokens */

+		char secondChar;

+		case '|': /* Either binary or logical OR */

+			secondChar = peek(0);

+			if (secondChar == '|') {

+				shiftChars(1);

+				return T_OP_LOGICOR;

+			}

+			return T_OP_OR;

+		case '=': /* Either SET alias, or EQ */

+			secondChar = peek(0);

+			if (secondChar == '=') {

+				shiftChars(1);

+				return T_OP_LOGICEQU;

+			}

+			return T_POP_EQUAL;

+		case '<': /* Either a LT, LTE, or left shift */

+			secondChar = peek(0);

+			if (secondChar == '=') {

+				shiftChars(1);

+				return T_OP_LOGICLE;

+			} else if (secondChar == '<') {

+				shiftChars(1);

+				return T_OP_SHL;

+			}

+			return T_OP_LOGICLT;

+		case '>': /* Either a GT, GTE, or right shift */

+			secondChar = peek(0);

+			if (secondChar == '=') {

+				shiftChars(1);

+				return T_OP_LOGICGE;

+			} else if (secondChar == '>') {

+				shiftChars(1);

+				return T_OP_SHR;

+			}

+			return T_OP_LOGICGT;

+		case '!': /* Either a NEQ, or negation */

+			secondChar = peek(0);

+			if (secondChar == '=') {

+				shiftChars(1);

+				return T_OP_LOGICNE;

+			}

+			return T_OP_LOGICNOT;

+		/* Handle numbers */

+		case '$':

+			yylval.nConstValue = 0;

+			readHexNumber();

+			return T_NUMBER;

+		case '0': /* Decimal number */

+		case '1':

+		case '2':

+		case '3':

+		case '4':

+		case '5':

+		case '6':

+		case '7':

+		case '8':

+		case '9':

+			readNumber(10, c - '0');

+			int perhapsPeriod = peek(0);

+			if (perhapsPeriod == '.') {

+				shiftChars(1);

+				readFractionalPart();

+			}

+			return T_NUMBER;

+		case '&':

+			secondChar = peek(0);

+			if (secondChar == '&') {

+				shiftChars(1);

+				return T_OP_LOGICAND;

+			} else if (secondChar >= '0' && secondChar <= '7') {

+				readNumber(8, 0);

+				return T_NUMBER;

+			}

+			return T_OP_AND;

+		case '%': /* Either a modulo, or a binary constant */

+			secondChar = peek(0);

+			if (secondChar != '0' && secondChar != '1')

+				return T_OP_MOD;

+			yylval.nConstValue = 0;

+			readBinaryNumber();

+			return T_NUMBER;

+		case '`': /* Gfx constant */

+			readGfxConstant();

+			return T_NUMBER;

+		/* Handle strings */

+		case '"':

+			readString();

+			return T_STRING;

+		/* Handle newlines and EOF */

+		case '\r':

+			if (peek(0) == '\n')

+				shiftChars(1); /* Shift that EOL */

+			/* fallthrough */

+		case '\n':

+			if (lexerStateEOL) {

+				lexer_SetState(lexerStateEOL);

+				lexerStateEOL = NULL;

+			}

+			return '\n';

 		case EOF:

 			/* Captures end at their buffer's boundary no matter what */

 			if (!lexerState->capturing) {

@@ -377,8 +1246,31 @@

 			return 0;

+		/* Handle identifiers... or error out */

 		default:

-			error("Unknown character '%c'\n");

+			if ((c <= 'Z' && c >= 'A')

+			 || (c <= 'z' && c >= 'a')

+			 || c == '.' || c == '_') {

+				int tokenType = readIdentifier(c);

+				/* If a keyword, don't try to expand */

+				if (tokenType != T_ID && tokenType != T_LOCAL_ID)

+					return tokenType;

+				/* TODO: attempt string expansion */

+				if (tokenType == T_ID && lexerState->atLineStart)

+					return T_LABEL;

+				return tokenType;

+			}

+			/* Do not report weird characters when capturing, it'll be done later */

+			if (!lexerState->capturing) {

+				/* TODO: try to group reportings */

+				error("Unknown character %s\n", reportGarbageChar(c));

+			}

 		lexerState->atLineStart = false;

@@ -389,6 +1281,56 @@

 	fatalerror("LEXER_RAW not yet implemented\n");

+/*

+ * Skips tokens until a line-start ENDC (or, unless `toEndc`, ELIF/ELSE) at the starting IF depth, and returns it (0 on EOF).

+ * This function uses the fact that `if`, etc. constructs are only valid when

+ * there's nothing before them on their lines. This enables filtering

+ * "meaningful" (= at line start) vs. "meaningless" (everything else) tokens.

+ * It's especially important due to macro args not being handled in this

+ * state, and lexing them in "normal" mode potentially producing such tokens.

+ */

+static int skipIfBlock(bool toEndc)

+{

+	lexer_SetMode(LEXER_NORMAL); /* Presumably so the yylex() calls below lex normally instead of re-entering this function */

+	int startingDepth = nIFDepth; /* Nesting depth of the IF block being skipped */

+	int token;

+	/* Prevent expanding macro args in this state by enabling capture to nothing */

+	lexerState->capturing = true;

+	lexerState->captureSize = 0;

+	lexerState->captureBuf = NULL;

+	for (;;) {

+		bool atLineStart = lexerState->atLineStart; /* Sampled before lexing, i.e. describes the token about to be read */

+		token = yylex();

+		if (token == 0) { /* Pass EOF through */

+			return token; /* NOTE(review): returns with lexerState->capturing still true -- confirm this is intended */

+		} else if (atLineStart && token == T_POP_IF) { /* Increase nesting */

+			nIFDepth++;

+		} else if (atLineStart && nIFDepth == startingDepth) { /* An occasion to finish? */

+			if (token == T_POP_ENDC || (!toEndc && (token == T_POP_ELIF

+							     || token == T_POP_ELSE)))

+				break; /* Found the terminator; any other line-start token at this depth is simply skipped */

+		} else if (atLineStart && token == T_POP_ENDC) { /* Decrease nesting */

+			nIFDepth--; /* Only reached for nested blocks: the starting depth is handled by the branch above */

+		}

+	}

+	lexerState->capturing = false; /* Leave the dummy capture before handing the terminator back */

+	return token;

+}

+static int yylex_SKIP_TO_ELIF(void) /* Lexer entry point for LEXER_SKIP_TO_ELIF mode */

+{

+	return skipIfBlock(false); /* toEndc == false: ELIF and ELSE also end the skip, not only ENDC */

+}

+static int yylex_SKIP_TO_ENDC(void) /* Lexer entry point for LEXER_SKIP_TO_ENDC mode */

+{

+	return skipIfBlock(true); /* toEndc == true: only ENDC ends the skip */

+}

 int yylex(void)

 	if (lexerState->atLineStart) {

@@ -397,21 +1339,27 @@

 	static int (* const lexerModeFuncs[])(void) = {

-		[LEXER_NORMAL] = yylex_NORMAL,

-		[LEXER_RAW]    = yylex_RAW,

+		[LEXER_NORMAL]       = yylex_NORMAL,

+		[LEXER_RAW]          = yylex_RAW,

+		[LEXER_SKIP_TO_ELIF] = yylex_SKIP_TO_ELIF,

+		[LEXER_SKIP_TO_ENDC] = yylex_SKIP_TO_ENDC

};

 	int token = lexerModeFuncs[lexerState->mode]();

+	/* Make sure to terminate files with a line feed */

+	if (token == 0 && lexerState->lastToken != '\n')

+		token = '\n';

+	lexerState->lastToken = token;

+	lexerState->atLineStart = false;

 	if (token == '\n')

 		lexerState->atLineStart = true;

-	else if (lexerState->atLineStart)

-		lexerState->atLineStart = false;

 	return token;

-void lexer_SkipToBlockEnd(int blockStartToken, int blockEndToken, int endToken,

-			  char const **capture, size_t *size, char const *name)

+void lexer_CaptureBlock(int blockStartToken, int blockEndToken, char const **capture, size_t *size,

+			char const *name)

 	lexerState->capturing = true;

 	lexerState->captureSize = 0;

@@ -418,30 +1366,19 @@

 	unsigned int level = 0;

 	char *captureStart;

-	if (capture) {

-		if (lexerState->isMmapped) {

-			captureStart = lexerState->ptr;

-		} else {

-			lexerState->captureCapacity = 128; /* The initial size will be twice that */

-			reallocCaptureBuf();

-			captureStart = lexerState->captureBuf;

-		}

+	if (lexerState->isMmapped) {

+		captureStart = lexerState->ptr;

+	} else {

+		lexerState->captureCapacity = 128; /* The initial size will be twice that */

+		reallocCaptureBuf();

+		captureStart = lexerState->captureBuf;

 	for (;;) {

 		int token = yylex();

-		if (level == 0) {

-			if (token == endToken)

-				break;

-			/*

-			 * Hack: skipping after a `if` requires stopping on three different tokens,

-			 * which there is no simple way to make this function support. Instead,

-			 * if ELIF is the end token, ELSE and ENDC are also checked for here.

-			 */

-			if (endToken == T_POP_ELIF && (token == T_POP_ELSE || token == T_POP_ENDC))

-				break;

-		}

+		if (level == 0 && token == blockEndToken)

+			break;

 		if (token == EOF)

 			error("Unterminated %s\n", name);

@@ -451,9 +1388,7 @@

 			level--;

-	if (capture) {

-		*capture = captureStart;

-		*size = lexerState->captureSize;

-	}

+	*capture = captureStart;

+	*size = lexerState->captureSize;

 	lexerState->captureBuf = NULL;

--- a/src/asm/main.c

+++ b/src/asm/main.c

@@ -488,6 +488,7 @@

 	if (!state)

 		fatalerror("Failed to open main file!\n");

+	lexer_Init();

 	lexer_SetState(state);

 	nStartClock = clock();

--- a/src/asm/symbol.c

+++ b/src/asm/symbol.c

@@ -210,8 +210,6 @@

 			labelScope = NULL;

 		hash_RemoveElement(symbols, symbol->name);

-		if (symbol->type == SYM_MACRO)

-			free(symbol->macro);

 		free(symbol);

@@ -230,8 +228,23 @@

/*

- * Return a constant symbols value

+ * Return a constant symbol's value, assuming it's defined

*/

+uint32_t sym_GetConstantSymValue(struct Symbol const *sym) /* `sym` is dereferenced unconditionally, so it must not be NULL */

+{

+	if (sym == PCSymbol) /* The PC symbol is special-cased before the constness check */

+		return sym_GetPCValue();

+	else if (!sym_IsConstant(sym))

+		error("\"%s\" does not have a constant value\n", sym->name); /* Reports the problem, then falls through to `return 0` */

+	else

+		return sym_GetValue(sym);

+	return 0; /* Only reached after the error above */

+}

+/*

+ * Return a constant symbol's value

+ */

 uint32_t sym_GetConstantValue(char const *s)

 	struct Symbol const *sym = sym_FindSymbol(s);

@@ -238,12 +251,8 @@

 	if (sym == NULL)

 		error("'%s' not defined\n", s);

-	else if (sym == PCSymbol)

-		return sym_GetPCValue();

-	else if (!sym_IsConstant(sym))

-		error("\"%s\" does not have a constant value\n", s);

 	else

-		return sym_GetValue(sym);

+		return sym_GetConstantSymValue(sym);

 	return 0;

@@ -468,13 +477,13 @@

/*

  * Add a macro definition

*/

-struct Symbol *sym_AddMacro(char const *symName, int32_t defLineNo)

+struct Symbol *sym_AddMacro(char const *symName, int32_t defLineNo, char const *body, size_t size)

 	struct Symbol *sym = createNonrelocSymbol(symName);

 	sym->type = SYM_MACRO;

-	sym->macroSize = ulNewMacroSize;

-	sym->macro = tzNewMacro;

+	sym->macroSize = size;

+	sym->macro = body;

 	updateSymbolFilename(sym);

/*

 	 * The symbol is created at the line after the `endm`,

--- a/src/asm/util.c

+++ b/src/asm/util.c

@@ -6,6 +6,7 @@

  * SPDX-License-Identifier: MIT

*/

+#include <ctype.h>

 #include <stdint.h>

 #include "asm/main.h"

@@ -25,6 +26,37 @@

 		hash = (hash * 33) ^ (*s++);

 	return hash;

+}

+char const *print(char c) /* Renders `c` as text: itself if printable, else an escape; returns a static buffer that each call overwrites */

+{

+	static char buf[5]; /* '\xNN' + '\0' */

+	if (isprint(c)) { /* NOTE(review): `c` is a plain char; a negative value passed to isprint() is undefined behaviour -- consider (unsigned char)c */

+		buf[0] = c;

+		buf[1] = '\0';

+		return buf;

+	}

+	buf[0] = '\\'; /* Every non-printable rendering starts with a backslash */

+	switch (c) {

+	case '\n':

+		buf[1] = 'n';

+		break;

+	case '\r':

+		buf[1] = 'r';

+		break;

+	case '\t':

+		buf[1] = 't';

+		break;

+	default: /* Print as hex */

+		buf[1] = 'x';

+		sprintf(&buf[2], "%02hhx", c); /* Writes two hex digits plus the NUL, filling buf[2..4] */

+		return buf;

+	}

+	buf[2] = '\0'; /* Terminates the two-character escapes (\n, \r, \t) */

+	return buf;

 size_t readUTF8Char(uint8_t *dest, char const *src)

--- a/src/asm/warning.c

+++ b/src/asm/warning.c

@@ -198,14 +198,14 @@

 	warnx("Unknown warning `%s`", flag);

-void verror(const char *fmt, va_list args, char const *flag)

+void printDiag(const char *fmt, va_list args, char const *type,

+	       char const *flagfmt, char const *flag)

-	fputs("ERROR: ", stderr);

+	fputs(type, stderr);

 	fstk_Dump();

-	fprintf(stderr, flag ? ": [-Werror=%s]\n    " : ":\n    ", flag);

+	fprintf(stderr, flagfmt, flag);

 	vfprintf(stderr, fmt, args);

 	lexer_DumpStringExpansions();

-	nbErrors++;

 void error(const char *fmt, ...)

@@ -213,8 +213,9 @@

 	va_list args;

 	va_start(args, fmt);

-	verror(fmt, args, NULL);

+	printDiag(fmt, args, "ERROR: ", "\n    ", NULL);

 	va_end(args);

+	nbErrors++;

 noreturn_ void fatalerror(const char *fmt, ...)

@@ -222,7 +223,7 @@

 	va_list args;

 	va_start(args, fmt);

-	verror(fmt, args, NULL);

+	printDiag(fmt, args, "FATAL: ", "\n    ", NULL);

 	va_end(args);

 	exit(1);

@@ -240,7 +241,7 @@

 		return;

 	case WARNING_ERROR:

-		verror(fmt, args, flag);

+		printDiag(fmt, args, "ERROR: ", "[-Werror=%s]\n    ", flag);

 		va_end(args);

 		return;

@@ -252,11 +253,7 @@

 		break;

-	fputs("warning: ", stderr);

-	fstk_Dump();

-	fprintf(stderr, ": [-W%s]\n    ", flag);

-	vfprintf(stderr, fmt, args);

-	lexer_DumpStringExpansions();

+	printDiag(fmt, args, "warning: ", "[-W%s]\n    ", flag);

 	va_end(args);

--

⑨