shithub: rgbds

--- a/include/asm/lexer.h

+++ b/include/asm/lexer.h

@@ -63,12 +63,18 @@

 void lexer_SetMode(enum LexerMode mode);

 void lexer_ToggleStringExpansion(bool enable);

+struct CaptureBody {

+	uint32_t lineNo;

+	char *body;

+	size_t size;

+};

 char const *lexer_GetFileName(void);

 uint32_t lexer_GetLineNo(void);

 uint32_t lexer_GetColNo(void);

 void lexer_DumpStringExpansions(void);

 int yylex(void);

-void lexer_CaptureRept(char **capture, size_t *size);

-void lexer_CaptureMacroBody(char **capture, size_t *size);

+void lexer_CaptureRept(struct CaptureBody *capture);

+void lexer_CaptureMacroBody(struct CaptureBody *capture);

 #endif /* RGBDS_ASM_LEXER_H */

--- a/src/asm/fstack.c

+++ b/src/asm/fstack.c

@@ -413,9 +413,8 @@

 	memcpy(dest, macro->name, macroNameLen + 1);

 	newContext((struct FileStackNode *)fileInfo);

-	/* Line minus 1 because buffer begins with a newline */

 	contextStack->lexerState = lexer_OpenFileView(macro->macro, macro->macroSize,

-						      macro->fileLine - 1);

+						      macro->fileLine);

 	if (!contextStack->lexerState)

 		fatalerror("Failed to set up lexer for macro invocation\n");

 	lexer_SetStateAtEOL(contextStack->lexerState);

--- a/src/asm/lexer.c

+++ b/src/asm/lexer.c

@@ -995,10 +995,21 @@

 	lexerState->disableMacroArgs = true;

 	lexerState->disableInterpolation = true;

 	for (;;) {

-		switch (nextChar()) {

+		int c = nextChar();

+		switch (c) {

 		case EOF:

 			error("Unterminated block comment\n");

 			goto finish;

+		case '\r':

+			/* Handle CRLF before nextLine() since shiftChars updates colNo */

+			if (peek(0) == '\n')

+				shiftChars(1);

+			/* fallthrough */

+		case '\n':

+			if (!lexerState->expansions || lexerState->expansions->distance)

+				nextLine();

+			continue;

 		case '/':

 			if (peek(0) == '*') {

 				warning(WARNING_NESTED_COMMENT,

@@ -2194,8 +2205,10 @@

-void lexer_CaptureRept(char **capture, size_t *size)

+void lexer_CaptureRept(struct CaptureBody *capture)

+	capture->lineNo = lexer_GetLineNo();

 	char *captureStart = startCapture();

 	unsigned int level = 0;

 	int c;

@@ -2228,14 +2241,7 @@

 					 * We know we have read exactly "ENDR", not e.g. an EQUS

*/

 					lexerState->captureSize -= strlen("ENDR");

-					/* Read (but don't capture) until EOL or EOF */

-					lexerState->capturing = false;

-					do {

-						c = nextChar();

-					} while (c != EOF && c != '\r' && c != '\n');

-					/* Handle Windows CRLF */

-					if (c == '\r' && peek(0) == '\n')

-						shiftChars(1);

+					lexerState->lastToken = T_POP_ENDR; // Force EOL at EOF

 					goto finish;

 				level--;

@@ -2246,7 +2252,6 @@

 		for (;;) {

 			if (c == EOF) {

 				error("Unterminated REPT/FOR block\n");

-				lexerState->capturing = false;

 				goto finish;

 			} else if (c == '\n' || c == '\r') {

 				if (c == '\r' && peek(0) == '\n')

@@ -2258,18 +2263,21 @@

 finish:

-	assert(!lexerState->capturing);

-	*capture = captureStart;

-	*size = lexerState->captureSize;

+	capture->body = captureStart;

+	capture->size = lexerState->captureSize;

+	lexerState->capturing = false;

 	lexerState->captureBuf = NULL;

 	lexerState->disableMacroArgs = false;

 	lexerState->disableInterpolation = false;

+	lexerState->atLineStart = false;

-void lexer_CaptureMacroBody(char **capture, size_t *size)

+void lexer_CaptureMacroBody(struct CaptureBody *capture)

+	capture->lineNo = lexer_GetLineNo();

 	char *captureStart = startCapture();

-	int c = peek(0);

+	int c;

 	/* If the file is `mmap`ed, we need not to unmap it to keep access to the macro */

 	if (lexerState->isMmapped)

@@ -2276,58 +2284,51 @@

 		lexerState->isReferenced = true;

/*

-	 * Due to parser internals, it does not read the EOL after the T_POP_MACRO before calling

-	 * this. Thus, we need to keep one in the buffer afterwards.

-	 * (Note that this also means the captured buffer begins with a newline and maybe comment)

+	 * The parser consumes the EOL after T_POP_MACRO before calling this function.

+	 * Thus, we don't need to keep one in the buffer afterwards.

 	 * The following assertion checks that.

*/

-	assert(!lexerState->atLineStart);

+	assert(lexerState->atLineStart);

 	for (;;) {

+		nextLine();

+		/* We're at line start, so attempt to match an `ENDM` token */

+		do { /* Discard initial whitespace */

+			c = nextChar();

+		} while (isWhitespace(c));

+		/* Now, try to match `ENDM` as a **whole** identifier */

+		if (startsIdentifier(c)) {

+			switch (readIdentifier(c)) {

+			case T_POP_ENDM:

+				/*

+				 * The ENDM has been captured, but we don't want it!

+				 * We know we have read exactly "ENDM", not e.g. an EQUS

+				 */

+				lexerState->captureSize -= strlen("ENDM");

+				lexerState->lastToken = T_POP_ENDM; // Force EOL at EOF

+				goto finish;

+			}

+		}

 		/* Just consume characters until EOL or EOF */

 		for (;;) {

 			if (c == EOF) {

 				error("Unterminated macro definition\n");

-				lexerState->capturing = false;

 				goto finish;

-			} else if (c == '\n') {

-				break;

-			} else if (c == '\r') {

-				if (peek(0) == '\n')

+			} else if (c == '\n' || c == '\r') {

+				if (c == '\r' && peek(0) == '\n')

 					shiftChars(1);

 				break;

 			c = nextChar();

-		/* We're at line start, attempt to match a `label: MACRO` line or `ENDM` token */

-		do { /* Discard initial whitespace */

-			c = nextChar();

-		} while (isWhitespace(c));

-		/* Now, try to match `ENDM` as a **whole** identifier */

-		if (startsIdentifier(c)) {

-			if (readIdentifier(c) == T_POP_ENDM) {

-				/* Read (but don't capture) until EOL or EOF */

-				lexerState->capturing = false;

-				do {

-					c = peek(0);

-					if (c == EOF || c == '\r' || c == '\n')

-						break;

-					shiftChars(1);

-				} while (c != EOF && c != '\r' && c != '\n');

-				/* Handle Windows CRLF */

-				if (c == '\r' && peek(1) == '\n')

-					shiftChars(1);

-				goto finish;

-			}

-		}

-		nextLine();

 finish:

-	assert(!lexerState->capturing);

-	*capture = captureStart;

-	*size = lexerState->captureSize - strlen("ENDM");

+	capture->body = captureStart;

+	capture->size = lexerState->captureSize;

+	lexerState->capturing = false;

 	lexerState->captureBuf = NULL;

 	lexerState->disableMacroArgs = false;

 	lexerState->disableInterpolation = false;

+	lexerState->atLineStart = false;

--- a/src/asm/parser.y

+++ b/src/asm/parser.y

@@ -36,10 +36,12 @@

 #include "linkdefs.h"

 #include "platform.h" // strncasecmp, strdup

-uint32_t nListCountEmpty;

-int32_t nPCOffset;

-bool executeElseBlock; /* If this is set, ELIFs cannot be executed anymore */

+int32_t nPCOffset; /* Read by rpn_Symbol */

+static uint32_t nListCountEmpty;

+static bool executeElseBlock; /* If this is set, ELIFs cannot be executed anymore */

+static struct CaptureBody captureBody; /* Captures a REPT/FOR or MACRO */

 static void upperstring(char *dest, char const *src)

 	while (*src)

@@ -596,17 +598,21 @@

 		| label cpu_command T_NEWLINE

 		| label macro T_NEWLINE

 		| label simple_pseudoop T_NEWLINE

-		| pseudoop T_NEWLINE

-		| conditional /* May not necessarily be followed by a newline, see below */

+		| assignment_pseudoop T_NEWLINE

+		| entire_line /* Commands that manage newlines themselves */

/*

- * For "logistical" reasons, conditionals must manage newlines themselves.

+ * For "logistical" reasons, these commands must manage newlines themselves.

  * This is because we need to switch the lexer's mode *after* the newline has been read,

  * and to avoid causing some grammar conflicts (token reducing is finicky).

  * This is DEFINITELY one of the more FRAGILE parts of the codebase, handle with care.

*/

-conditional	: if

+entire_line	: macrodef

+		| rept

+		| for

+		| break

+		| if

 		/* It's important that all of these require being at line start for `skipIfBlock` */

 		| elif

 		| else

@@ -699,13 +705,13 @@

-pseudoop	: equ

+/* These commands start with a T_LABEL. */

+assignment_pseudoop	: equ

 		| set

 		| rb

 		| rw

 		| rl

 		| equs

-		| macrodef

 simple_pseudoop : include

@@ -733,10 +739,7 @@

 		| pushc

 		| popc

 		| load

-		| rept

-		| for

 		| shift

-		| break

 		| fail

 		| warn

 		| assert

@@ -851,21 +854,18 @@

 		| T_POP_ENDL	{ out_EndLoadSection(); }

-rept		: T_POP_REPT uconst {

-			uint32_t nDefinitionLineNo = lexer_GetLineNo();

-			char *body;

-			size_t size;

-			lexer_CaptureRept(&body, &size);

-			fstk_RunRept($2, nDefinitionLineNo, body, size);

+rept		: T_POP_REPT uconst T_NEWLINE {

+			lexer_CaptureRept(&captureBody);

+		} T_NEWLINE {

+			fstk_RunRept($2, captureBody.lineNo, captureBody.body, captureBody.size);

-for		: T_POP_FOR T_ID T_COMMA for_args {

-			uint32_t nDefinitionLineNo = lexer_GetLineNo();

-			char *body;

-			size_t size;

-			lexer_CaptureRept(&body, &size);

-			fstk_RunFor($2, $4.start, $4.stop, $4.step, nDefinitionLineNo, body, size);

+for		: T_POP_FOR T_ID T_COMMA for_args T_NEWLINE {

+			lexer_CaptureRept(&captureBody);

+		} T_NEWLINE {

+			fstk_RunFor($2, $4.start, $4.stop, $4.step, captureBody.lineNo,

+				    captureBody.body, captureBody.size);

 for_args	: const {

@@ -885,18 +885,16 @@

-break		: T_POP_BREAK		{

+break		: T_POP_BREAK T_NEWLINE {

 			if (fstk_Break())

 				lexer_SetMode(LEXER_SKIP_TO_ENDR);

-macrodef	: T_LABEL T_COLON T_POP_MACRO {

-			int32_t nDefinitionLineNo = lexer_GetLineNo();

-			char *body;

-			size_t size;

-			lexer_CaptureMacroBody(&body, &size);

-			sym_AddMacro($1, nDefinitionLineNo, body, size);

+macrodef	: T_LABEL T_COLON T_POP_MACRO T_NEWLINE {

+			lexer_CaptureMacroBody(&captureBody);

+		} T_NEWLINE {

+			sym_AddMacro($1, captureBody.lineNo, captureBody.body, captureBody.size);

--- a/test/asm/break.err

+++ b/test/asm/break.err

@@ -2,5 +2,5 @@

     done 5

 warning: break.asm(17): [-Wuser]

OK

-FATAL: break.asm(18) -> break.asm::REPT~1(23):

+FATAL: break.asm(18) -> break.asm::REPT~1(22):

     Ended block with 1 unterminated IF construct

--- a/test/asm/invalid-utf-8.asm

+++ b/test/asm/invalid-utf-8.asm

@@ -1,5 +1,6 @@

 ; This test tries to pass invalid UTF-8 through a macro argument

 ; to exercise the lexer's reportGarbageChar

-m:MACRO \1

+m:MACRO

+	\1

 ENDM

 	m ��

--- a/test/asm/invalid-utf-8.err

+++ b/test/asm/invalid-utf-8.err

@@ -1,5 +1,5 @@

-ERROR: invalid-utf-8.asm(4) -> invalid-utf-8.asm::m(3):

+ERROR: invalid-utf-8.asm(6) -> invalid-utf-8.asm::m(4):

     Unknown character 0xCF

-ERROR: invalid-utf-8.asm(4) -> invalid-utf-8.asm::m(3):

+ERROR: invalid-utf-8.asm(6) -> invalid-utf-8.asm::m(4):

     Unknown character 0xD3

 error: Assembly aborted (2 errors)!

--- a/test/asm/nested-macrodef.err

+++ b/test/asm/nested-macrodef.err

@@ -1,5 +1,7 @@

 warning: nested-macrodef.asm(26) -> nested-macrodef.asm::outer(22): [-Wuser]

     Nested macros shouldn't work, whose argument would be \1?

-ERROR: nested-macrodef.asm(26) -> nested-macrodef.asm::outer(25):

+ERROR: nested-macrodef.asm(26) -> nested-macrodef.asm::outer(24):

     Unterminated macro definition

-error: Assembly aborted (1 errors)!

+ERROR: nested-macrodef.asm(27):

+    syntax error, unexpected identifier, expecting newline

+error: Assembly aborted (2 errors)!

--- /dev/null

+++ b/test/asm/nested-macrodef.simple.err

@@ -1,0 +1,7 @@

+warning: nested-macrodef.asm(26) -> nested-macrodef.asm::outer(22): [-Wuser]

+    Nested macros shouldn't work, whose argument would be \1?

+ERROR: nested-macrodef.asm(26) -> nested-macrodef.asm::outer(24):

+    Unterminated macro definition

+ERROR: nested-macrodef.asm(27):

+    syntax error

+error: Assembly aborted (2 errors)!

--

⑨