shithub: rgbds

Download patch

ref: df16e64fc63d7975009d90acea75bd2fa2b9eef4
parent: a4ebb878585504294b4e9d67d4f85355815a0e02
author: Rangi <remy.oukaour+rangi42@gmail.com>
date: Tue Jan 12 16:30:13 EST 2021

Handle MACRO and REPT/FOR bodies differently

Fixes #697

--- a/include/asm/lexer.h
+++ b/include/asm/lexer.h
@@ -63,12 +63,18 @@
 void lexer_SetMode(enum LexerMode mode);
 void lexer_ToggleStringExpansion(bool enable);
 
+struct CaptureBody {
+	uint32_t lineNo;
+	char *body;
+	size_t size;
+};
+
 char const *lexer_GetFileName(void);
 uint32_t lexer_GetLineNo(void);
 uint32_t lexer_GetColNo(void);
 void lexer_DumpStringExpansions(void);
 int yylex(void);
-void lexer_CaptureRept(char **capture, size_t *size);
-void lexer_CaptureMacroBody(char **capture, size_t *size);
+void lexer_CaptureRept(struct CaptureBody *capture);
+void lexer_CaptureMacroBody(struct CaptureBody *capture);
 
 #endif /* RGBDS_ASM_LEXER_H */
--- a/src/asm/fstack.c
+++ b/src/asm/fstack.c
@@ -413,9 +413,8 @@
 	memcpy(dest, macro->name, macroNameLen + 1);
 
 	newContext((struct FileStackNode *)fileInfo);
-	/* Line minus 1 because buffer begins with a newline */
 	contextStack->lexerState = lexer_OpenFileView(macro->macro, macro->macroSize,
-						      macro->fileLine - 1);
+						      macro->fileLine);
 	if (!contextStack->lexerState)
 		fatalerror("Failed to set up lexer for macro invocation\n");
 	lexer_SetStateAtEOL(contextStack->lexerState);
--- a/src/asm/lexer.c
+++ b/src/asm/lexer.c
@@ -995,10 +995,21 @@
 	lexerState->disableMacroArgs = true;
 	lexerState->disableInterpolation = true;
 	for (;;) {
-		switch (nextChar()) {
+		int c = nextChar();
+
+		switch (c) {
 		case EOF:
 			error("Unterminated block comment\n");
 			goto finish;
+		case '\r':
+			/* Handle CRLF before nextLine() since shiftChars updates colNo */
+			if (peek(0) == '\n')
+				shiftChars(1);
+			/* fallthrough */
+		case '\n':
+			if (!lexerState->expansions || lexerState->expansions->distance)
+				nextLine();
+			continue;
 		case '/':
 			if (peek(0) == '*') {
 				warning(WARNING_NESTED_COMMENT,
@@ -2194,8 +2205,10 @@
 	}
 }
 
-void lexer_CaptureRept(char **capture, size_t *size)
+void lexer_CaptureRept(struct CaptureBody *capture)
 {
+	capture->lineNo = lexer_GetLineNo();
+
 	char *captureStart = startCapture();
 	unsigned int level = 0;
 	int c;
@@ -2228,14 +2241,7 @@
 					 * We know we have read exactly "ENDR", not e.g. an EQUS
 					 */
 					lexerState->captureSize -= strlen("ENDR");
-					/* Read (but don't capture) until EOL or EOF */
-					lexerState->capturing = false;
-					do {
-						c = nextChar();
-					} while (c != EOF && c != '\r' && c != '\n');
-					/* Handle Windows CRLF */
-					if (c == '\r' && peek(0) == '\n')
-						shiftChars(1);
+					lexerState->lastToken = T_POP_ENDR; // Force EOL at EOF
 					goto finish;
 				}
 				level--;
@@ -2246,7 +2252,6 @@
 		for (;;) {
 			if (c == EOF) {
 				error("Unterminated REPT/FOR block\n");
-				lexerState->capturing = false;
 				goto finish;
 			} else if (c == '\n' || c == '\r') {
 				if (c == '\r' && peek(0) == '\n')
@@ -2258,18 +2263,21 @@
 	}
 
 finish:
-	assert(!lexerState->capturing);
-	*capture = captureStart;
-	*size = lexerState->captureSize;
+	capture->body = captureStart;
+	capture->size = lexerState->captureSize;
+	lexerState->capturing = false;
 	lexerState->captureBuf = NULL;
 	lexerState->disableMacroArgs = false;
 	lexerState->disableInterpolation = false;
+	lexerState->atLineStart = false;
 }
 
-void lexer_CaptureMacroBody(char **capture, size_t *size)
+void lexer_CaptureMacroBody(struct CaptureBody *capture)
 {
+	capture->lineNo = lexer_GetLineNo();
+
 	char *captureStart = startCapture();
-	int c = peek(0);
+	int c;
 
 	/* If the file is `mmap`ed, we need not to unmap it to keep access to the macro */
 	if (lexerState->isMmapped)
@@ -2276,58 +2284,51 @@
 		lexerState->isReferenced = true;
 
 	/*
-	 * Due to parser internals, it does not read the EOL after the T_POP_MACRO before calling
-	 * this. Thus, we need to keep one in the buffer afterwards.
-	 * (Note that this also means the captured buffer begins with a newline and maybe comment)
+	 * Due to parser internals, it reads the EOL after the expression before calling this.
+	 * Thus, we don't need to keep one in the buffer afterwards.
 	 * The following assertion checks that.
 	 */
-	assert(!lexerState->atLineStart);
+	assert(lexerState->atLineStart);
 	for (;;) {
+		nextLine();
+		/* We're at line start, so attempt to match an `ENDM` token */
+		do { /* Discard initial whitespace */
+			c = nextChar();
+		} while (isWhitespace(c));
+		/* Now, try to match `ENDM` as a **whole** identifier */
+		if (startsIdentifier(c)) {
+			switch (readIdentifier(c)) {
+			case T_POP_ENDM:
+				/*
+				 * The ENDM has been captured, but we don't want it!
+				 * We know we have read exactly "ENDM", not e.g. an EQUS
+				 */
+				lexerState->captureSize -= strlen("ENDM");
+				lexerState->lastToken = T_POP_ENDM; // Force EOL at EOF
+				goto finish;
+			}
+		}
+
 		/* Just consume characters until EOL or EOF */
 		for (;;) {
 			if (c == EOF) {
 				error("Unterminated macro definition\n");
-				lexerState->capturing = false;
 				goto finish;
-			} else if (c == '\n') {
-				break;
-			} else if (c == '\r') {
-				if (peek(0) == '\n')
+			} else if (c == '\n' || c == '\r') {
+				if (c == '\r' && peek(0) == '\n')
 					shiftChars(1);
 				break;
 			}
 			c = nextChar();
 		}
-
-		/* We're at line start, attempt to match a `label: MACRO` line or `ENDM` token */
-		do { /* Discard initial whitespace */
-			c = nextChar();
-		} while (isWhitespace(c));
-		/* Now, try to match `ENDM` as a **whole** identifier */
-		if (startsIdentifier(c)) {
-			if (readIdentifier(c) == T_POP_ENDM) {
-				/* Read (but don't capture) until EOL or EOF */
-				lexerState->capturing = false;
-				do {
-					c = peek(0);
-					if (c == EOF || c == '\r' || c == '\n')
-						break;
-					shiftChars(1);
-				} while (c != EOF && c != '\r' && c != '\n');
-				/* Handle Windows CRLF */
-				if (c == '\r' && peek(1) == '\n')
-					shiftChars(1);
-				goto finish;
-			}
-		}
-		nextLine();
 	}
 
 finish:
-	assert(!lexerState->capturing);
-	*capture = captureStart;
-	*size = lexerState->captureSize - strlen("ENDM");
+	capture->body = captureStart;
+	capture->size = lexerState->captureSize;
+	lexerState->capturing = false;
 	lexerState->captureBuf = NULL;
 	lexerState->disableMacroArgs = false;
 	lexerState->disableInterpolation = false;
+	lexerState->atLineStart = false;
 }
--- a/src/asm/parser.y
+++ b/src/asm/parser.y
@@ -36,10 +36,12 @@
 #include "linkdefs.h"
 #include "platform.h" // strncasecmp, strdup
 
-uint32_t nListCountEmpty;
-int32_t nPCOffset;
-bool executeElseBlock; /* If this is set, ELIFs cannot be executed anymore */
+int32_t nPCOffset; /* Read by rpn_Symbol */
 
+static uint32_t nListCountEmpty;
+static bool executeElseBlock; /* If this is set, ELIFs cannot be executed anymore */
+static struct CaptureBody captureBody; /* Captures a REPT/FOR or MACRO */
+
 static void upperstring(char *dest, char const *src)
 {
 	while (*src)
@@ -596,17 +598,21 @@
 		| label cpu_command T_NEWLINE
 		| label macro T_NEWLINE
 		| label simple_pseudoop T_NEWLINE
-		| pseudoop T_NEWLINE
-		| conditional /* May not necessarily be followed by a newline, see below */
+		| assignment_pseudoop T_NEWLINE
+		| entire_line /* Commands that manage newlines themselves */
 ;
 
 /*
- * For "logistical" reasons, conditionals must manage newlines themselves.
+ * For "logistical" reasons, these commands must manage newlines themselves.
  * This is because we need to switch the lexer's mode *after* the newline has been read,
  * and to avoid causing some grammar conflicts (token reducing is finicky).
  * This is DEFINITELY one of the more FRAGILE parts of the codebase, handle with care.
  */
-conditional	: if
+entire_line	: macrodef
+		| rept
+		| for
+		| break
+		| if
 		/* It's important that all of these require being at line start for `skipIfBlock` */
 		| elif
 		| else
@@ -699,13 +705,13 @@
 		}
 ;
 
-pseudoop	: equ
+/* These commands start with a T_LABEL. */
+assignment_pseudoop	: equ
 		| set
 		| rb
 		| rw
 		| rl
 		| equs
-		| macrodef
 ;
 
 simple_pseudoop : include
@@ -733,10 +739,7 @@
 		| pushc
 		| popc
 		| load
-		| rept
-		| for
 		| shift
-		| break
 		| fail
 		| warn
 		| assert
@@ -851,21 +854,18 @@
 		| T_POP_ENDL	{ out_EndLoadSection(); }
 ;
 
-rept		: T_POP_REPT uconst {
-			uint32_t nDefinitionLineNo = lexer_GetLineNo();
-			char *body;
-			size_t size;
-			lexer_CaptureRept(&body, &size);
-			fstk_RunRept($2, nDefinitionLineNo, body, size);
+rept		: T_POP_REPT uconst T_NEWLINE {
+			lexer_CaptureRept(&captureBody);
+		} T_NEWLINE {
+			fstk_RunRept($2, captureBody.lineNo, captureBody.body, captureBody.size);
 		}
 ;
 
-for		: T_POP_FOR T_ID T_COMMA for_args {
-			uint32_t nDefinitionLineNo = lexer_GetLineNo();
-			char *body;
-			size_t size;
-			lexer_CaptureRept(&body, &size);
-			fstk_RunFor($2, $4.start, $4.stop, $4.step, nDefinitionLineNo, body, size);
+for		: T_POP_FOR T_ID T_COMMA for_args T_NEWLINE {
+			lexer_CaptureRept(&captureBody);
+		} T_NEWLINE {
+			fstk_RunFor($2, $4.start, $4.stop, $4.step, captureBody.lineNo,
+				    captureBody.body, captureBody.size);
 		}
 
 for_args	: const {
@@ -885,18 +885,16 @@
 		}
 ;
 
-break		: T_POP_BREAK		{
+break		: T_POP_BREAK T_NEWLINE {
 			if (fstk_Break())
 				lexer_SetMode(LEXER_SKIP_TO_ENDR);
 		}
 ;
 
-macrodef	: T_LABEL T_COLON T_POP_MACRO {
-			int32_t nDefinitionLineNo = lexer_GetLineNo();
-			char *body;
-			size_t size;
-			lexer_CaptureMacroBody(&body, &size);
-			sym_AddMacro($1, nDefinitionLineNo, body, size);
+macrodef	: T_LABEL T_COLON T_POP_MACRO T_NEWLINE {
+			lexer_CaptureMacroBody(&captureBody);
+		} T_NEWLINE {
+			sym_AddMacro($1, captureBody.lineNo, captureBody.body, captureBody.size);
 		}
 ;
 
--- a/test/asm/break.err
+++ b/test/asm/break.err
@@ -2,5 +2,5 @@
     done 5
 warning: break.asm(17): [-Wuser]
     OK
-FATAL: break.asm(18) -> break.asm::REPT~1(23):
+FATAL: break.asm(18) -> break.asm::REPT~1(22):
     Ended block with 1 unterminated IF construct
--- a/test/asm/invalid-utf-8.asm
+++ b/test/asm/invalid-utf-8.asm
@@ -1,5 +1,6 @@
 ; This test tries to pass invalid UTF-8 through a macro argument
 ; to exercise the lexer's reportGarbageChar
-m:MACRO \1
+m:MACRO
+	\1
 ENDM
 	m ��
--- a/test/asm/invalid-utf-8.err
+++ b/test/asm/invalid-utf-8.err
@@ -1,5 +1,5 @@
-ERROR: invalid-utf-8.asm(4) -> invalid-utf-8.asm::m(3):
+ERROR: invalid-utf-8.asm(6) -> invalid-utf-8.asm::m(4):
     Unknown character 0xCF
-ERROR: invalid-utf-8.asm(4) -> invalid-utf-8.asm::m(3):
+ERROR: invalid-utf-8.asm(6) -> invalid-utf-8.asm::m(4):
     Unknown character 0xD3
 error: Assembly aborted (2 errors)!
--- a/test/asm/nested-macrodef.err
+++ b/test/asm/nested-macrodef.err
@@ -1,5 +1,7 @@
 warning: nested-macrodef.asm(26) -> nested-macrodef.asm::outer(22): [-Wuser]
     Nested macros shouldn't work, whose argument would be \1?
-ERROR: nested-macrodef.asm(26) -> nested-macrodef.asm::outer(25):
+ERROR: nested-macrodef.asm(26) -> nested-macrodef.asm::outer(24):
     Unterminated macro definition
-error: Assembly aborted (1 errors)!
+ERROR: nested-macrodef.asm(27):
+    syntax error, unexpected identifier, expecting newline
+error: Assembly aborted (2 errors)!
--- /dev/null
+++ b/test/asm/nested-macrodef.simple.err
@@ -1,0 +1,7 @@
+warning: nested-macrodef.asm(26) -> nested-macrodef.asm::outer(22): [-Wuser]
+    Nested macros shouldn't work, whose argument would be \1?
+ERROR: nested-macrodef.asm(26) -> nested-macrodef.asm::outer(24):
+    Unterminated macro definition
+ERROR: nested-macrodef.asm(27):
+    syntax error
+error: Assembly aborted (2 errors)!