ref: c35cb6ac32b703662456226b0c40b6669b54cdc0
dir: /src/asm/lexer.c/
/*
* This file is part of RGBDS.
*
* Copyright (c) 2020, Eldred Habert and RGBDS contributors.
*
* SPDX-License-Identifier: MIT
*/
#include <sys/types.h>
#include <sys/stat.h>
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <math.h>
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifndef _MSC_VER
#include <unistd.h>
#endif
#include "platform.h" // For `ssize_t`
#include "asm/lexer.h"
#include "asm/fixpoint.h"
#include "asm/format.h"
#include "asm/fstack.h"
#include "asm/macro.h"
#include "asm/main.h"
#include "asm/rpn.h"
#include "asm/symbol.h"
#include "asm/util.h"
#include "asm/warning.h"
// Include this last so it gets all type & constant definitions
#include "parser.h" // For token definitions, generated from parser.y
// Neither MSVC nor MinGW provide `mmap`
#if defined(_MSC_VER) || defined(__MINGW32__)
# define WIN32_LEAN_AND_MEAN // include less from windows.h
# include <windows.h> // target architecture
# include <fileapi.h> // CreateFileA
# include <winbase.h> // CreateFileMappingA
# include <memoryapi.h> // MapViewOfFile
# include <handleapi.h> // CloseHandle
# define MAP_FAILED NULL
# define mapFile(ptr, fd, path, size) do { \
(ptr) = MAP_FAILED; \
HANDLE file = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, \
FILE_FLAG_POSIX_SEMANTICS | FILE_FLAG_RANDOM_ACCESS, NULL); \
HANDLE mappingObj; \
\
if (file == INVALID_HANDLE_VALUE) \
break; \
mappingObj = CreateFileMappingA(file, NULL, PAGE_READONLY, 0, 0, NULL); \
if (mappingObj != INVALID_HANDLE_VALUE) \
(ptr) = MapViewOfFile(mappingObj, FILE_MAP_READ, 0, 0, 0); \
CloseHandle(mappingObj); \
CloseHandle(file); \
} while (0)
# define munmap(ptr, size) UnmapViewOfFile((ptr))
#else // defined(_MSC_VER) || defined(__MINGW32__)
# include <sys/mman.h>
# define mapFile(ptr, fd, path, size) do { \
(ptr) = mmap(NULL, (size), PROT_READ, MAP_PRIVATE, (fd), 0); \
\
if ((ptr) == MAP_FAILED && errno == ENOTSUP) { \
/*
* The implementation may not support MAP_PRIVATE; try again with MAP_SHARED
* instead, offering, I believe, weaker guarantees about external modifications to
* the file while reading it. That's still better than not opening it at all, though
*/ \
if (verbose) \
printf("mmap(%s, MAP_PRIVATE) failed, retrying with MAP_SHARED\n", path); \
(ptr) = mmap(NULL, (size), PROT_READ, MAP_SHARED, (fd), 0); \
} \
} while (0)
#endif // !( defined(_MSC_VER) || defined(__MINGW32__) )
// Identifiers that are also keywords are listed here. This ONLY applies to ones
// that would normally be matched as identifiers! Check out `yylex_NORMAL` to
// see how this is used.
// Tokens / keywords not handled here are handled in `yylex_NORMAL`'s switch.
static struct KeywordMapping {
char const *name;
int token;
} const keywords[] = {
// CAUTION when editing this: adding keywords will probably require extra nodes in the
// `keywordDict` array. If you forget to, you will probably trip up an assertion, anyways.
// Also, all entries in this array must be in uppercase for the dict to build correctly.
{"ADC", T_Z80_ADC},
{"ADD", T_Z80_ADD},
{"AND", T_Z80_AND},
{"BIT", T_Z80_BIT},
{"CALL", T_Z80_CALL},
{"CCF", T_Z80_CCF},
{"CPL", T_Z80_CPL},
{"CP", T_Z80_CP},
{"DAA", T_Z80_DAA},
{"DEC", T_Z80_DEC},
{"DI", T_Z80_DI},
{"EI", T_Z80_EI},
{"HALT", T_Z80_HALT},
{"INC", T_Z80_INC},
{"JP", T_Z80_JP},
{"JR", T_Z80_JR},
{"LD", T_Z80_LD},
{"LDI", T_Z80_LDI},
{"LDD", T_Z80_LDD},
{"LDIO", T_Z80_LDH},
{"LDH", T_Z80_LDH},
{"NOP", T_Z80_NOP},
{"OR", T_Z80_OR},
{"POP", T_Z80_POP},
{"PUSH", T_Z80_PUSH},
{"RES", T_Z80_RES},
{"RETI", T_Z80_RETI},
{"RET", T_Z80_RET},
{"RLCA", T_Z80_RLCA},
{"RLC", T_Z80_RLC},
{"RLA", T_Z80_RLA},
{"RL", T_Z80_RL},
{"RRC", T_Z80_RRC},
{"RRCA", T_Z80_RRCA},
{"RRA", T_Z80_RRA},
{"RR", T_Z80_RR},
{"RST", T_Z80_RST},
{"SBC", T_Z80_SBC},
{"SCF", T_Z80_SCF},
{"SET", T_Z80_SET},
{"SLA", T_Z80_SLA},
{"SRA", T_Z80_SRA},
{"SRL", T_Z80_SRL},
{"STOP", T_Z80_STOP},
{"SUB", T_Z80_SUB},
{"SWAP", T_Z80_SWAP},
{"XOR", T_Z80_XOR},
{"NZ", T_CC_NZ},
{"Z", T_CC_Z},
{"NC", T_CC_NC},
// Handled after as T_TOKEN_C
// { "C", T_CC_C },
{"AF", T_MODE_AF},
{"BC", T_MODE_BC},
{"DE", T_MODE_DE},
{"HL", T_MODE_HL},
{"SP", T_MODE_SP},
{"HLD", T_MODE_HL_DEC},
{"HLI", T_MODE_HL_INC},
{"A", T_TOKEN_A},
{"B", T_TOKEN_B},
{"C", T_TOKEN_C},
{"D", T_TOKEN_D},
{"E", T_TOKEN_E},
{"H", T_TOKEN_H},
{"L", T_TOKEN_L},
{"DEF", T_OP_DEF},
{"FRAGMENT", T_POP_FRAGMENT},
{"BANK", T_OP_BANK},
{"ALIGN", T_OP_ALIGN},
{"SIZEOF", T_OP_SIZEOF},
{"STARTOF", T_OP_STARTOF},
{"ROUND", T_OP_ROUND},
{"CEIL", T_OP_CEIL},
{"FLOOR", T_OP_FLOOR},
{"DIV", T_OP_FDIV},
{"MUL", T_OP_FMUL},
{"FMOD", T_OP_FMOD},
{"POW", T_OP_POW},
{"LOG", T_OP_LOG},
{"SIN", T_OP_SIN},
{"COS", T_OP_COS},
{"TAN", T_OP_TAN},
{"ASIN", T_OP_ASIN},
{"ACOS", T_OP_ACOS},
{"ATAN", T_OP_ATAN},
{"ATAN2", T_OP_ATAN2},
{"HIGH", T_OP_HIGH},
{"LOW", T_OP_LOW},
{"ISCONST", T_OP_ISCONST},
{"STRCMP", T_OP_STRCMP},
{"STRIN", T_OP_STRIN},
{"STRRIN", T_OP_STRRIN},
{"STRSUB", T_OP_STRSUB},
{"STRLEN", T_OP_STRLEN},
{"STRCAT", T_OP_STRCAT},
{"STRUPR", T_OP_STRUPR},
{"STRLWR", T_OP_STRLWR},
{"STRRPL", T_OP_STRRPL},
{"STRFMT", T_OP_STRFMT},
{"CHARLEN", T_OP_CHARLEN},
{"CHARSUB", T_OP_CHARSUB},
{"INCLUDE", T_POP_INCLUDE},
{"PRINT", T_POP_PRINT},
{"PRINTLN", T_POP_PRINTLN},
{"EXPORT", T_POP_EXPORT},
{"DS", T_POP_DS},
{"DB", T_POP_DB},
{"DW", T_POP_DW},
{"DL", T_POP_DL},
{"SECTION", T_POP_SECTION},
{"PURGE", T_POP_PURGE},
{"RSRESET", T_POP_RSRESET},
{"RSSET", T_POP_RSSET},
{"INCBIN", T_POP_INCBIN},
{"CHARMAP", T_POP_CHARMAP},
{"NEWCHARMAP", T_POP_NEWCHARMAP},
{"SETCHARMAP", T_POP_SETCHARMAP},
{"PUSHC", T_POP_PUSHC},
{"POPC", T_POP_POPC},
{"FAIL", T_POP_FAIL},
{"WARN", T_POP_WARN},
{"FATAL", T_POP_FATAL},
{"ASSERT", T_POP_ASSERT},
{"STATIC_ASSERT", T_POP_STATIC_ASSERT},
{"MACRO", T_POP_MACRO},
{"ENDM", T_POP_ENDM},
{"SHIFT", T_POP_SHIFT},
{"REPT", T_POP_REPT},
{"FOR", T_POP_FOR},
{"ENDR", T_POP_ENDR},
{"BREAK", T_POP_BREAK},
{"LOAD", T_POP_LOAD},
{"ENDL", T_POP_ENDL},
{"IF", T_POP_IF},
{"ELSE", T_POP_ELSE},
{"ELIF", T_POP_ELIF},
{"ENDC", T_POP_ENDC},
{"UNION", T_POP_UNION},
{"NEXTU", T_POP_NEXTU},
{"ENDU", T_POP_ENDU},
{"WRAM0", T_SECT_WRAM0},
{"VRAM", T_SECT_VRAM},
{"ROMX", T_SECT_ROMX},
{"ROM0", T_SECT_ROM0},
{"HRAM", T_SECT_HRAM},
{"WRAMX", T_SECT_WRAMX},
{"SRAM", T_SECT_SRAM},
{"OAM", T_SECT_OAM},
{"RB", T_POP_RB},
{"RW", T_POP_RW},
// Handled before as T_Z80_RL
// {"RL", T_POP_RL},
{"EQU", T_POP_EQU},
{"EQUS", T_POP_EQUS},
{"REDEF", T_POP_REDEF},
{"PUSHS", T_POP_PUSHS},
{"POPS", T_POP_POPS},
{"PUSHO", T_POP_PUSHO},
{"POPO", T_POP_POPO},
{"OPT", T_POP_OPT},
{".", T_PERIOD},
};
static bool isWhitespace(int c)
{
return c == ' ' || c == '\t';
}
#define LEXER_BUF_SIZE 42 // TODO: determine a sane value for this
// The buffer needs to be large enough for the maximum `peekInternal` lookahead distance
static_assert(LEXER_BUF_SIZE > 1, "Lexer buffer size is too small");
// This caps the size of buffer reads, and according to POSIX, passing more than SSIZE_MAX is UB
static_assert(LEXER_BUF_SIZE <= SSIZE_MAX, "Lexer buffer size is too large");
struct Expansion {
struct Expansion *parent;
char *name;
union {
char const *unowned;
char *owned;
} contents;
size_t size; // Length of the contents
size_t offset; // Cursor into the contents
bool owned; // Whether or not to free contents when this expansion is freed
};
struct IfStack {
struct IfStack *next;
bool ranIfBlock; // Whether an IF/ELIF/ELSE block ran already
bool reachedElseBlock; // Whether an ELSE block ran already
};
struct LexerState {
char const *path;
// mmap()-dependent IO state
bool isMmapped;
union {
struct { // If mmap()ed
char *ptr; // Technically `const` during the lexer's execution
size_t size;
size_t offset;
bool isReferenced; // If a macro in this file requires not unmapping it
};
struct { // Otherwise
int fd;
size_t index; // Read index into the buffer
char buf[LEXER_BUF_SIZE]; // Circular buffer
size_t nbChars; // Number of "fresh" chars in the buffer
};
};
// Common state
bool isFile;
enum LexerMode mode;
bool atLineStart;
uint32_t lineNo;
uint32_t colNo;
int lastToken;
struct IfStack *ifStack;
bool capturing; // Whether the text being lexed should be captured
size_t captureSize; // Amount of text captured
char *captureBuf; // Buffer to send the captured text to if non-NULL
size_t captureCapacity; // Size of the buffer above
bool disableMacroArgs;
bool disableInterpolation;
size_t macroArgScanDistance; // Max distance already scanned for macro args
bool expandStrings;
struct Expansion *expansions; // Points to the innermost current expansion
};
struct LexerState *lexerState = NULL;
struct LexerState *lexerStateEOL = NULL;
static void initState(struct LexerState *state)
{
state->mode = LEXER_NORMAL;
state->atLineStart = true; // yylex() will init colNo due to this
state->lastToken = T_EOF;
state->ifStack = NULL;
state->capturing = false;
state->captureBuf = NULL;
state->disableMacroArgs = false;
state->disableInterpolation = false;
state->macroArgScanDistance = 0;
state->expandStrings = true;
state->expansions = NULL;
}
static void nextLine(void)
{
lexerState->lineNo++;
lexerState->colNo = 1;
}
uint32_t lexer_GetIFDepth(void)
{
uint32_t depth = 0;
for (struct IfStack *stack = lexerState->ifStack; stack != NULL; stack = stack->next)
depth++;
return depth;
}
void lexer_IncIFDepth(void)
{
struct IfStack *ifStack = malloc(sizeof(*ifStack));
if (!ifStack)
fatalerror("Unable to allocate new IF depth: %s\n", strerror(errno));
ifStack->ranIfBlock = false;
ifStack->reachedElseBlock = false;
ifStack->next = lexerState->ifStack;
lexerState->ifStack = ifStack;
}
void lexer_DecIFDepth(void)
{
if (!lexerState->ifStack)
fatalerror("Found ENDC outside an IF construct\n");
struct IfStack *top = lexerState->ifStack->next;
free(lexerState->ifStack);
lexerState->ifStack = top;
}
bool lexer_RanIFBlock(void)
{
return lexerState->ifStack->ranIfBlock;
}
bool lexer_ReachedELSEBlock(void)
{
return lexerState->ifStack->reachedElseBlock;
}
void lexer_RunIFBlock(void)
{
lexerState->ifStack->ranIfBlock = true;
}
void lexer_ReachELSEBlock(void)
{
lexerState->ifStack->reachedElseBlock = true;
}
struct LexerState *lexer_OpenFile(char const *path)
{
bool isStdin = !strcmp(path, "-");
struct LexerState *state = malloc(sizeof(*state));
struct stat fileInfo;
// Give stdin a nicer file name
if (isStdin)
path = "<stdin>";
if (!state) {
error("Failed to allocate memory for lexer state: %s\n", strerror(errno));
return NULL;
}
if (!isStdin && stat(path, &fileInfo) != 0) {
error("Failed to stat file \"%s\": %s\n", path, strerror(errno));
free(state);
return NULL;
}
state->path = path;
state->isFile = true;
state->fd = isStdin ? STDIN_FILENO : open(path, O_RDONLY);
if (state->fd < 0) {
error("Failed to open file \"%s\": %s\n", path, strerror(errno));
free(state);
return NULL;
}
state->isMmapped = false; // By default, assume it won't be mmap()ed
if (!isStdin && fileInfo.st_size > 0) {
// Try using `mmap` for better performance
// Important: do NOT assign to `state->ptr` directly, to avoid a cast that may
// alter an eventual `MAP_FAILED` value. It would also invalidate `state->fd`,
// being on the other side of the union.
void *mappingAddr;
mapFile(mappingAddr, state->fd, state->path, fileInfo.st_size);
if (mappingAddr == MAP_FAILED) {
// If mmap()ing failed, try again using another method (below)
state->isMmapped = false;
} else {
// IMPORTANT: the `union` mandates this is accessed before other members!
close(state->fd);
state->isMmapped = true;
state->isReferenced = false; // By default, a state isn't referenced
state->ptr = mappingAddr;
assert(fileInfo.st_size >= 0);
state->size = (size_t)fileInfo.st_size;
state->offset = 0;
if (verbose)
printf("File %s successfully mmap()ped\n", path);
}
}
if (!state->isMmapped) {
// Sometimes mmap() fails or isn't available, so have a fallback
if (verbose) {
if (isStdin)
printf("Opening stdin\n");
else if (fileInfo.st_size == 0)
printf("File %s is empty\n", path);
else
printf("File %s opened as regular, errno reports \"%s\"\n",
path, strerror(errno));
}
state->index = 0;
state->nbChars = 0;
}
initState(state);
state->lineNo = 0; // Will be incremented at first line start
return state;
}
struct LexerState *lexer_OpenFileView(char const *path, char *buf, size_t size, uint32_t lineNo)
{
struct LexerState *state = malloc(sizeof(*state));
if (!state) {
error("Failed to allocate memory for lexer state: %s\n", strerror(errno));
return NULL;
}
state->path = path; // Used to report read errors in `peekInternal`
state->isFile = false;
state->isMmapped = true; // It's not *really* mmap()ed, but it behaves the same
state->ptr = buf;
state->size = size;
state->offset = 0;
initState(state);
state->lineNo = lineNo; // Will be incremented at first line start
return state;
}
void lexer_RestartRept(uint32_t lineNo)
{
lexerState->offset = 0;
initState(lexerState);
lexerState->lineNo = lineNo;
}
void lexer_DeleteState(struct LexerState *state)
{
// A big chunk of the lexer state soundness is the file stack ("fstack").
// Each context in the fstack has its own *unique* lexer state; thus, we always guarantee
// that lexer states lifetimes are always properly managed, since they're handled solely
// by the fstack... with *one* exception.
// Assume a context is pushed on top of the fstack, and the corresponding lexer state gets
// scheduled at EOF; `lexerStateEOL` thus becomes a (weak) ref to that lexer state...
// It has been possible, due to a bug, that the corresponding fstack context gets popped
// before EOL, deleting the associated state... but it would still be switched to at EOL.
// This assertion checks that this doesn't happen again.
// It could be argued that deleting a state that's scheduled for EOF could simply clear
// `lexerStateEOL`, but there's currently no situation in which this should happen.
assert(state != lexerStateEOL);
if (!state->isMmapped)
close(state->fd);
else if (state->isFile && !state->isReferenced)
munmap(state->ptr, state->size);
free(state);
}
struct KeywordDictNode {
// The identifier charset is (currently) 44 characters big. By storing entries for the
// entire printable ASCII charset, minus lower-case due to case-insensitivity,
// we only waste (0x60 - 0x20) - 70 = 20 entries per node, which should be acceptable.
// In turn, this allows greatly simplifying checking an index into this array,
// which should help speed up the lexer.
uint16_t children[0x60 - ' '];
struct KeywordMapping const *keyword;
// Since the keyword structure is invariant, the min number of nodes is known at compile time
} keywordDict[365] = {0}; // Make sure to keep this correct when adding keywords!
// Convert a char into its index into the dict
static uint8_t dictIndex(char c)
{
// Translate uppercase to lowercase (roughly)
if (c > 0x60)
c = c - ('a' - 'A');
return c - ' ';
}
void lexer_Init(void)
{
// Build the dictionary of keywords. This could be done at compile time instead, however:
// - Doing so manually is a task nobody wants to undertake
// - It would be massively hard to read
// - Doing it within CC or CPP would be quite non-trivial
// - Doing it externally would require some extra work to use only POSIX tools
// - The startup overhead isn't much compared to the program's
uint16_t usedNodes = 1;
for (size_t i = 0; i < ARRAY_SIZE(keywords); i++) {
uint16_t nodeID = 0;
// Walk the dictionary, creating intermediate nodes for the keyword
for (char const *ptr = keywords[i].name; *ptr; ptr++) {
// We should be able to assume all entries are well-formed
if (keywordDict[nodeID].children[*ptr - ' '] == 0) {
// If this gets tripped up, set the size of keywordDict to
// something high, compile with `-DPRINT_NODE_COUNT` (see below),
// and set the size to that.
assert(usedNodes < sizeof(keywordDict) / sizeof(*keywordDict));
// There is no node at that location, grab one from the pool
keywordDict[nodeID].children[*ptr - ' '] = usedNodes;
usedNodes++;
}
nodeID = keywordDict[nodeID].children[*ptr - ' '];
}
// This assumes that no two keywords have the same name
keywordDict[nodeID].keyword = &keywords[i];
}
#ifdef PRINT_NODE_COUNT // For the maintainer to check how many nodes are needed
printf("Lexer keyword dictionary: %zu keywords in %u nodes (pool size %zu)\n",
ARRAY_SIZE(keywords), usedNodes, ARRAY_SIZE(keywordDict));
#endif
}
void lexer_SetMode(enum LexerMode mode)
{
lexerState->mode = mode;
}
void lexer_ToggleStringExpansion(bool enable)
{
lexerState->expandStrings = enable;
}
// Functions for the actual lexer to obtain characters
static void reallocCaptureBuf(void)
{
if (lexerState->captureCapacity == SIZE_MAX)
fatalerror("Cannot grow capture buffer past %zu bytes\n", SIZE_MAX);
else if (lexerState->captureCapacity > SIZE_MAX / 2)
lexerState->captureCapacity = SIZE_MAX;
else
lexerState->captureCapacity *= 2;
lexerState->captureBuf = realloc(lexerState->captureBuf, lexerState->captureCapacity);
if (!lexerState->captureBuf)
fatalerror("realloc error while resizing capture buffer: %s\n", strerror(errno));
}
static void beginExpansion(char const *str, bool owned, char const *name)
{
size_t size = strlen(str);
// Do not expand empty strings
if (!size)
return;
if (name)
lexer_CheckRecursionDepth();
struct Expansion *exp = malloc(sizeof(*exp));
if (!exp)
fatalerror("Unable to allocate new expansion: %s\n", strerror(errno));
exp->parent = lexerState->expansions;
exp->name = name ? strdup(name) : NULL;
exp->contents.unowned = str;
exp->size = size;
exp->offset = 0;
exp->owned = owned;
lexerState->expansions = exp;
}
void lexer_CheckRecursionDepth(void)
{
size_t depth = 0;
for (struct Expansion *exp = lexerState->expansions; exp; exp = exp->parent) {
if (depth++ > maxRecursionDepth)
fatalerror("Recursion limit (%zu) exceeded\n", maxRecursionDepth);
}
}
static void freeExpansion(struct Expansion *expansion)
{
free(expansion->name);
if (expansion->owned)
free(expansion->contents.owned);
free(expansion);
}
static bool isMacroChar(char c)
{
return c == '@' || c == '#' || c == '<' || (c >= '0' && c <= '9');
}
// forward declarations for readBracketedMacroArgNum
static int peek(void);
static void shiftChar(void);
static uint32_t readNumber(int radix, uint32_t baseValue);
static bool startsIdentifier(int c);
static bool continuesIdentifier(int c);
static uint32_t readBracketedMacroArgNum(void)
{
bool disableMacroArgs = lexerState->disableMacroArgs;
bool disableInterpolation = lexerState->disableInterpolation;
lexerState->disableMacroArgs = false;
lexerState->disableInterpolation = false;
uint32_t num = 0;
int c = peek();
bool empty = false;
bool symbolError = false;
if (c >= '0' && c <= '9') {
num = readNumber(10, 0);
} else if (startsIdentifier(c)) {
char symName[MAXSYMLEN + 1];
size_t i = 0;
for (; continuesIdentifier(c); c = peek()) {
if (i < sizeof(symName))
symName[i++] = c;
shiftChar();
}
if (i == sizeof(symName)) {
warning(WARNING_LONG_STR, "Bracketed symbol name too long\n");
i--;
}
symName[i] = '\0';
struct Symbol const *sym = sym_FindScopedSymbol(symName);
if (!sym) {
error("Bracketed symbol \"%s\" does not exist\n", symName);
num = 0;
symbolError = true;
} else if (!sym_IsNumeric(sym)) {
error("Bracketed symbol \"%s\" is not numeric\n", symName);
num = 0;
symbolError = true;
} else {
num = sym_GetConstantSymValue(sym);
}
} else {
empty = true;
}
c = peek();
shiftChar();
if (c != '>') {
error("Invalid character in bracketed macro argument %s\n", printChar(c));
return 0;
} else if (empty) {
error("Empty bracketed macro argument\n");
return 0;
} else if (num == 0 && !symbolError) {
error("Invalid bracketed macro argument '\\<0>'\n");
return 0;
}
lexerState->disableMacroArgs = disableMacroArgs;
lexerState->disableInterpolation = disableInterpolation;
return num;
}
static char const *readMacroArg(char name)
{
char const *str = NULL;
if (name == '@') {
str = macro_GetUniqueIDStr();
} else if (name == '#') {
str = macro_GetAllArgs();
} else if (name == '<') {
uint32_t num = readBracketedMacroArgNum();
if (num == 0)
return NULL;
str = macro_GetArg(num);
if (!str)
error("Macro argument '\\<%" PRIu32 ">' not defined\n", num);
return str;
} else if (name == '0') {
error("Invalid macro argument '\\0'\n");
return NULL;
} else {
assert(name > '0' && name <= '9');
str = macro_GetArg(name - '0');
}
if (!str)
error("Macro argument '\\%c' not defined\n", name);
return str;
}
static size_t readInternal(size_t bufIndex, size_t nbChars)
{
// This buffer overflow made me lose WEEKS of my life. Never again.
assert(bufIndex + nbChars <= LEXER_BUF_SIZE);
ssize_t nbReadChars = read(lexerState->fd, &lexerState->buf[bufIndex], nbChars);
if (nbReadChars == -1)
fatalerror("Error while reading \"%s\": %s\n", lexerState->path, strerror(errno));
// `nbReadChars` cannot be negative, so it's fine to cast to `size_t`
return (size_t)nbReadChars;
}
// We only need one character of lookahead, for macro arguments
static int peekInternal(uint8_t distance)
{
for (struct Expansion *exp = lexerState->expansions; exp; exp = exp->parent) {
// An expansion that has reached its end will have `exp->offset` == `exp->size`,
// and `peekInternal` will continue with its parent
assert(exp->offset <= exp->size);
if (distance < exp->size - exp->offset)
return exp->contents.unowned[exp->offset + distance];
distance -= exp->size - exp->offset;
}
if (distance >= LEXER_BUF_SIZE)
fatalerror("Internal lexer error: buffer has insufficient size for peeking (%"
PRIu8 " >= %u)\n", distance, LEXER_BUF_SIZE);
if (lexerState->isMmapped) {
if (lexerState->offset + distance >= lexerState->size)
return EOF;
return (unsigned char)lexerState->ptr[lexerState->offset + distance];
}
if (lexerState->nbChars <= distance) {
// Buffer isn't full enough, read some chars in
size_t target = LEXER_BUF_SIZE - lexerState->nbChars; // Aim: making the buf full
// Compute the index we'll start writing to
size_t writeIndex = (lexerState->index + lexerState->nbChars) % LEXER_BUF_SIZE;
// If the range to fill passes over the buffer wrapping point, we need two reads
if (writeIndex + target > LEXER_BUF_SIZE) {
size_t nbExpectedChars = LEXER_BUF_SIZE - writeIndex;
size_t nbReadChars = readInternal(writeIndex, nbExpectedChars);
lexerState->nbChars += nbReadChars;
writeIndex += nbReadChars;
if (writeIndex == LEXER_BUF_SIZE)
writeIndex = 0;
// If the read was incomplete, don't perform a second read
target -= nbReadChars;
if (nbReadChars < nbExpectedChars)
target = 0;
}
if (target != 0)
lexerState->nbChars += readInternal(writeIndex, target);
// If there aren't enough chars even after refilling, give up
if (lexerState->nbChars <= distance)
return EOF;
}
return (unsigned char)lexerState->buf[(lexerState->index + distance) % LEXER_BUF_SIZE];
}
// forward declarations for peek
static void shiftChar(void);
static char const *readInterpolation(size_t depth);
static int peek(void)
{
int c = peekInternal(0);
if (lexerState->macroArgScanDistance > 0)
return c;
lexerState->macroArgScanDistance++; // Do not consider again
if (c == '\\' && !lexerState->disableMacroArgs) {
// If character is a backslash, check for a macro arg
lexerState->macroArgScanDistance++;
c = peekInternal(1);
if (isMacroChar(c)) {
shiftChar();
shiftChar();
char const *str = readMacroArg(c);
// If the macro arg is invalid or an empty string, it cannot be
// expanded, so skip it and keep peeking.
if (!str || !str[0])
return peek();
beginExpansion(str, c == '#', NULL);
// Assuming macro args can't be recursive (I'll be damned if a way
// is found...), then we mark the entire macro arg as scanned.
lexerState->macroArgScanDistance += strlen(str);
c = str[0];
} else {
c = '\\';
}
} else if (c == '{' && !lexerState->disableInterpolation) {
// If character is an open brace, do symbol interpolation
shiftChar();
char const *str = readInterpolation(0);
if (str && str[0])
beginExpansion(str, false, str);
return peek();
}
return c;
}
static void shiftChar(void)
{
if (lexerState->capturing) {
if (lexerState->captureBuf) {
if (lexerState->captureSize + 1 >= lexerState->captureCapacity)
reallocCaptureBuf();
// TODO: improve this?
lexerState->captureBuf[lexerState->captureSize] = peek();
}
lexerState->captureSize++;
}
lexerState->macroArgScanDistance--;
restart:
if (lexerState->expansions) {
// Advance within the current expansion
assert(lexerState->expansions->offset <= lexerState->expansions->size);
lexerState->expansions->offset++;
if (lexerState->expansions->offset > lexerState->expansions->size) {
// When advancing would go past an expansion's end, free it,
// move up to its parent, and try again to advance
struct Expansion *exp = lexerState->expansions;
lexerState->expansions = lexerState->expansions->parent;
freeExpansion(exp);
goto restart;
}
} else {
// Advance within the file contents
lexerState->colNo++;
if (lexerState->isMmapped) {
lexerState->offset++;
} else {
assert(lexerState->index < LEXER_BUF_SIZE);
lexerState->index++;
if (lexerState->index == LEXER_BUF_SIZE)
lexerState->index = 0; // Wrap around if necessary
assert(lexerState->nbChars > 0);
lexerState->nbChars--;
}
}
}
static int nextChar(void)
{
int c = peek();
// If not at EOF, advance read position
if (c != EOF)
shiftChar();
return c;
}
static void handleCRLF(int c)
{
if (c == '\r' && peek() == '\n')
shiftChar();
}
// "Services" provided by the lexer to the rest of the program
char const *lexer_GetFileName(void)
{
return lexerState ? lexerState->path : NULL;
}
uint32_t lexer_GetLineNo(void)
{
return lexerState->lineNo;
}
uint32_t lexer_GetColNo(void)
{
return lexerState->colNo;
}
void lexer_DumpStringExpansions(void)
{
if (!lexerState)
return;
for (struct Expansion *exp = lexerState->expansions; exp; exp = exp->parent) {
// Only register EQUS expansions, not string args
if (exp->name)
fprintf(stderr, "while expanding symbol \"%s\"\n", exp->name);
}
}
// Discards a block comment
static void discardBlockComment(void)
{
lexerState->disableMacroArgs = true;
lexerState->disableInterpolation = true;
for (;;) {
int c = nextChar();
switch (c) {
case EOF:
error("Unterminated block comment\n");
goto finish;
case '\r':
// Handle CRLF before nextLine() since shiftChar updates colNo
handleCRLF(c);
// fallthrough
case '\n':
if (!lexerState->expansions)
nextLine();
continue;
case '/':
if (peek() == '*') {
warning(WARNING_NESTED_COMMENT,
"/* in block comment\n");
}
continue;
case '*':
if (peek() == '/') {
shiftChar();
goto finish;
}
// fallthrough
default:
continue;
}
}
finish:
lexerState->disableMacroArgs = false;
lexerState->disableInterpolation = false;
}
// Function to discard all of a line's comments
static void discardComment(void)
{
lexerState->disableMacroArgs = true;
lexerState->disableInterpolation = true;
for (;; shiftChar()) {
int c = peek();
if (c == EOF || c == '\r' || c == '\n')
break;
}
lexerState->disableMacroArgs = false;
lexerState->disableInterpolation = false;
}
// Function to read a line continuation
static void readLineContinuation(void)
{
for (;;) {
int c = peek();
if (isWhitespace(c)) {
shiftChar();
} else if (c == '\r' || c == '\n') {
shiftChar();
// Handle CRLF before nextLine() since shiftChar updates colNo
handleCRLF(c);
if (!lexerState->expansions)
nextLine();
break;
} else if (c == ';') {
discardComment();
} else {
error("Begun line continuation, but encountered character %s\n",
printChar(c));
break;
}
}
}
// Function to read an anonymous label ref
static void readAnonLabelRef(char c)
{
uint32_t n = 0;
// We come here having already peeked at one char, so no need to do it again
do {
shiftChar();
n++;
} while (peek() == c);
sym_WriteAnonLabelName(yylval.symName, n, c == '-');
}
// Functions to lex numbers of various radixes
static uint32_t readNumber(int radix, uint32_t baseValue)
{
uint32_t value = baseValue;
for (;; shiftChar()) {
int c = peek();
if (c == '_')
continue;
else if (c < '0' || c > '0' + radix - 1)
break;
if (value > (UINT32_MAX - (c - '0')) / radix)
warning(WARNING_LARGE_CONSTANT, "Integer constant is too large\n");
value = value * radix + (c - '0');
}
return value;
}
static uint32_t readFractionalPart(uint32_t integer)
{
uint32_t value = 0, divisor = 1;
uint8_t precision = 0;
enum {
READFRACTIONALPART_DIGITS,
READFRACTIONALPART_PRECISION,
READFRACTIONALPART_PRECISION_DIGITS,
} state = READFRACTIONALPART_DIGITS;
for (;; shiftChar()) {
int c = peek();
if (state == READFRACTIONALPART_DIGITS) {
if (c == '_') {
continue;
} else if (c == 'q' || c == 'Q') {
state = READFRACTIONALPART_PRECISION;
continue;
} else if (c < '0' || c > '9') {
break;
}
if (divisor > (UINT32_MAX - (c - '0')) / 10) {
warning(WARNING_LARGE_CONSTANT,
"Precision of fixed-point constant is too large\n");
// Discard any additional digits
shiftChar();
while (c = peek(), (c >= '0' && c <= '9') || c == '_')
shiftChar();
break;
}
value = value * 10 + (c - '0');
divisor *= 10;
} else {
if (c == '.' && state == READFRACTIONALPART_PRECISION) {
state = READFRACTIONALPART_PRECISION_DIGITS;
continue;
} else if (c < '0' || c > '9') {
break;
}
precision = precision * 10 + (c - '0');
}
}
if (precision == 0) {
if (state >= READFRACTIONALPART_PRECISION)
error("Invalid fixed-point constant, no significant digits after 'q'\n");
precision = fixPrecision;
} else if (precision > 31) {
error("Fixed-point constant precision must be between 1 and 31\n");
precision = fixPrecision;
}
if (integer >= ((uint32_t)1 << (precision - 1)))
warning(WARNING_LARGE_CONSTANT, "Magnitude of fixed-point constant is too large\n");
// Cast to unsigned avoids undefined overflow behavior
uint32_t fractional = (uint32_t)round((double)value / divisor * pow(2.0, precision));
return (integer << precision) | fractional;
}
char binDigits[2];
static uint32_t readBinaryNumber(void)
{
uint32_t value = 0;
for (;; shiftChar()) {
int c = peek();
int bit;
// Check for '_' after digits in case one of the digits is '_'
if (c == binDigits[0])
bit = 0;
else if (c == binDigits[1])
bit = 1;
else if (c == '_')
continue;
else
break;
if (value > (UINT32_MAX - bit) / 2)
warning(WARNING_LARGE_CONSTANT, "Integer constant is too large\n");
value = value * 2 + bit;
}
return value;
}
static uint32_t readHexNumber(void)
{
uint32_t value = 0;
bool empty = true;
for (;; shiftChar()) {
int c = peek();
if (c >= 'a' && c <= 'f')
c = c - 'a' + 10;
else if (c >= 'A' && c <= 'F')
c = c - 'A' + 10;
else if (c >= '0' && c <= '9')
c = c - '0';
else if (c == '_' && !empty)
continue;
else
break;
if (value > (UINT32_MAX - c) / 16)
warning(WARNING_LARGE_CONSTANT, "Integer constant is too large\n");
value = value * 16 + c;
empty = false;
}
if (empty)
error("Invalid integer constant, no digits after '$'\n");
return value;
}
char gfxDigits[4];
static uint32_t readGfxConstant(void)
{
uint32_t bitPlaneLower = 0, bitPlaneUpper = 0;
uint8_t width = 0;
for (;; shiftChar()) {
int c = peek();
uint32_t pixel;
// Check for '_' after digits in case one of the digits is '_'
if (c == gfxDigits[0])
pixel = 0;
else if (c == gfxDigits[1])
pixel = 1;
else if (c == gfxDigits[2])
pixel = 2;
else if (c == gfxDigits[3])
pixel = 3;
else if (c == '_' && width > 0)
continue;
else
break;
if (width < 8) {
bitPlaneLower = bitPlaneLower << 1 | (pixel & 1);
bitPlaneUpper = bitPlaneUpper << 1 | (pixel >> 1);
}
if (width < 9)
width++;
}
if (width == 0)
error("Invalid graphics constant, no digits after '`'\n");
else if (width == 9)
warning(WARNING_LARGE_CONSTANT,
"Graphics constant is too long, only first 8 pixels considered\n");
return bitPlaneUpper << 8 | bitPlaneLower;
}
// Functions to read identifiers & keywords
static bool startsIdentifier(int c)
{
// Anonymous labels internally start with '!'
return (c <= 'Z' && c >= 'A') || (c <= 'z' && c >= 'a') || c == '.' || c == '_';
}
static bool continuesIdentifier(int c)
{
return startsIdentifier(c) || (c <= '9' && c >= '0') || c == '#' || c == '@';
}
static int readIdentifier(char firstChar)
{
// Lex while checking for a keyword
yylval.symName[0] = firstChar;
uint16_t nodeID = keywordDict[0].children[dictIndex(firstChar)];
int tokenType = firstChar == '.' ? T_LOCAL_ID : T_ID;
size_t i = 1;
// Continue reading while the char is in the symbol charset
for (int c = peek(); continuesIdentifier(c); i++, c = peek()) {
shiftChar();
if (i < sizeof(yylval.symName) - 1) {
// Write the char to the identifier's name
yylval.symName[i] = c;
// If the char was a dot, mark the identifier as local
if (c == '.')
tokenType = T_LOCAL_ID;
// Attempt to traverse the tree to check for a keyword
if (nodeID) // Do nothing if matching already failed
nodeID = keywordDict[nodeID].children[dictIndex(c)];
}
}
if (i > sizeof(yylval.symName) - 1) {
warning(WARNING_LONG_STR, "Symbol name too long, got truncated\n");
i = sizeof(yylval.symName) - 1;
}
yylval.symName[i] = '\0'; // Terminate the string
if (keywordDict[nodeID].keyword)
return keywordDict[nodeID].keyword->token;
return tokenType;
}
// Functions to read strings
static char const *readInterpolation(size_t depth)
{
if (depth > maxRecursionDepth)
fatalerror("Recursion limit (%zu) exceeded\n", maxRecursionDepth);
char symName[MAXSYMLEN + 1];
size_t i = 0;
struct FormatSpec fmt = fmt_NewSpec();
bool disableInterpolation = lexerState->disableInterpolation;
// In a context where `lexerState->disableInterpolation` is true, `peek` will expand
// nested interpolations itself, which can lead to stack overflow. This lets
// `readInterpolation` handle its own nested expansions, increasing `depth` each time.
lexerState->disableInterpolation = true;
for (;;) {
int c = peek();
if (c == '{') { // Nested interpolation
shiftChar();
char const *str = readInterpolation(depth + 1);
if (str && str[0])
beginExpansion(str, false, str);
continue; // Restart, reading from the new buffer
} else if (c == EOF || c == '\r' || c == '\n' || c == '"') {
error("Missing }\n");
break;
} else if (c == '}') {
shiftChar();
break;
} else if (c == ':' && !fmt_IsFinished(&fmt)) { // Format spec, only once
shiftChar();
for (size_t j = 0; j < i; j++)
fmt_UseCharacter(&fmt, symName[j]);
fmt_FinishCharacters(&fmt);
symName[i] = '\0';
if (!fmt_IsValid(&fmt))
error("Invalid format spec '%s'\n", symName);
i = 0; // Now that format has been set, restart at beginning of string
} else {
shiftChar();
if (i < sizeof(symName)) // Allow writing an extra char to flag overflow
symName[i++] = c;
}
}
if (i == sizeof(symName)) {
warning(WARNING_LONG_STR, "Interpolated symbol name too long\n");
i--;
}
symName[i] = '\0';
// Don't return before `lexerState->disableInterpolation` is reset!
lexerState->disableInterpolation = disableInterpolation;
static char buf[MAXSTRLEN + 1];
struct Symbol const *sym = sym_FindScopedSymbol(symName);
if (!sym) {
error("Interpolated symbol \"%s\" does not exist\n", symName);
} else if (sym->type == SYM_EQUS) {
if (fmt_IsEmpty(&fmt))
// No format was specified
fmt.type = 's';
fmt_PrintString(buf, sizeof(buf), &fmt, sym_GetStringValue(sym));
return buf;
} else if (sym_IsNumeric(sym)) {
if (fmt_IsEmpty(&fmt)) {
// No format was specified; default to uppercase $hex
fmt.type = 'X';
fmt.prefix = true;
}
fmt_PrintNumber(buf, sizeof(buf), &fmt, sym_GetConstantSymValue(sym));
return buf;
} else {
error("Only numerical and string symbols can be interpolated\n");
}
return NULL;
}
#define append_yylval_string(c) do { \
char v = (c); /* Evaluate c exactly once in case it has side effects. */ \
if (i < sizeof(yylval.string)) \
yylval.string[i++] = v; \
} while (0)
static size_t appendEscapedSubstring(char const *str, size_t i)
{
// Copy one extra to flag overflow
while (*str) {
char c = *str++;
// Escape characters that need escaping
switch (c) {
case '\\':
case '"':
case '{':
append_yylval_string('\\');
break;
case '\n':
append_yylval_string('\\');
c = 'n';
break;
case '\r':
append_yylval_string('\\');
c = 'r';
break;
case '\t':
append_yylval_string('\\');
c = 't';
break;
}
append_yylval_string(c);
}
return i;
}
static void readString(void)
{
lexerState->disableMacroArgs = true;
lexerState->disableInterpolation = true;
size_t i = 0;
bool multiline = false;
char const *str;
// We reach this function after reading a single quote, but we also support triple quotes
if (peek() == '"') {
shiftChar();
if (peek() == '"') {
// """ begins a multi-line string
shiftChar();
multiline = true;
} else {
// "" is an empty string, skip the loop
goto finish;
}
}
for (;;) {
int c = peek();
// '\r', '\n' or EOF ends a single-line string early
if (c == EOF || (!multiline && (c == '\r' || c == '\n'))) {
error("Unterminated string\n");
break;
}
// We'll be staying in the string, so we can safely consume the char
shiftChar();
// Handle '\r' or '\n' (in multiline strings only, already handled above otherwise)
if (c == '\r' || c == '\n') {
// Handle CRLF before nextLine() since shiftChar updates colNo
handleCRLF(c);
nextLine();
c = '\n';
}
switch (c) {
case '"':
if (multiline) {
// Only """ ends a multi-line string
if (peek() != '"')
break;
shiftChar();
if (peek() != '"') {
append_yylval_string('"');
break;
}
shiftChar();
}
goto finish;
case '\\': // Character escape or macro arg
c = peek();
switch (c) {
case '\\':
case '"':
case '{':
case '}':
// Return that character unchanged
shiftChar();
break;
case 'n':
c = '\n';
shiftChar();
break;
case 'r':
c = '\r';
shiftChar();
break;
case 't':
c = '\t';
shiftChar();
break;
// Line continuation
case ' ':
case '\r':
case '\n':
readLineContinuation();
continue;
// Macro arg
case '@':
case '#':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '<':
shiftChar();
str = readMacroArg(c);
if (str) {
while (*str)
append_yylval_string(*str++);
}
continue; // Do not copy an additional character
case EOF: // Can't really print that one
error("Illegal character escape at end of input\n");
c = '\\';
break;
default:
error("Illegal character escape %s\n", printChar(c));
shiftChar();
break;
}
break;
case '{': // Symbol interpolation
// We'll be exiting the string scope, so re-enable expansions
// (Not interpolations, since they're handled by the function itself...)
lexerState->disableMacroArgs = false;
str = readInterpolation(0);
if (str) {
while (*str)
append_yylval_string(*str++);
}
lexerState->disableMacroArgs = true;
continue; // Do not copy an additional character
// Regular characters will just get copied
}
append_yylval_string(c);
}
finish:
if (i == sizeof(yylval.string)) {
i--;
warning(WARNING_LONG_STR, "String constant too long\n");
}
yylval.string[i] = '\0';
lexerState->disableMacroArgs = false;
lexerState->disableInterpolation = false;
}
static size_t appendStringLiteral(size_t i)
{
lexerState->disableMacroArgs = true;
lexerState->disableInterpolation = true;
bool multiline = false;
char const *str;
// We reach this function after reading a single quote, but we also support triple quotes
append_yylval_string('"');
if (peek() == '"') {
append_yylval_string('"');
shiftChar();
if (peek() == '"') {
// """ begins a multi-line string
append_yylval_string('"');
shiftChar();
multiline = true;
} else {
// "" is an empty string, skip the loop
goto finish;
}
}
for (;;) {
int c = peek();
// '\r', '\n' or EOF ends a single-line string early
if (c == EOF || (!multiline && (c == '\r' || c == '\n'))) {
error("Unterminated string\n");
break;
}
// We'll be staying in the string, so we can safely consume the char
shiftChar();
// Handle '\r' or '\n' (in multiline strings only, already handled above otherwise)
if (c == '\r' || c == '\n') {
// Handle CRLF before nextLine() since shiftChar updates colNo
handleCRLF(c);
nextLine();
c = '\n';
}
switch (c) {
case '"':
if (multiline) {
// Only """ ends a multi-line string
if (peek() != '"')
break;
append_yylval_string('"');
shiftChar();
if (peek() != '"')
break;
append_yylval_string('"');
shiftChar();
}
append_yylval_string('"');
goto finish;
case '\\': // Character escape or macro arg
c = peek();
switch (c) {
// Character escape
case '\\':
case '"':
case '{':
case '}':
case 'n':
case 'r':
case 't':
// Return that character unchanged
append_yylval_string('\\');
shiftChar();
break;
// Line continuation
case ' ':
case '\r':
case '\n':
readLineContinuation();
continue;
// Macro arg
case '@':
case '#':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '<':
shiftChar();
str = readMacroArg(c);
if (str && str[0])
i = appendEscapedSubstring(str, i);
continue; // Do not copy an additional character
case EOF: // Can't really print that one
error("Illegal character escape at end of input\n");
c = '\\';
break;
case ',': // `\,` inside a macro arg string literal
warning(WARNING_OBSOLETE,
"`\\,` is deprecated inside strings\n");
shiftChar();
break;
default:
error("Illegal character escape %s\n", printChar(c));
shiftChar();
break;
}
break;
case '{': // Symbol interpolation
// We'll be exiting the string scope, so re-enable expansions
// (Not interpolations, since they're handled by the function itself...)
lexerState->disableMacroArgs = false;
str = readInterpolation(0);
if (str && str[0])
i = appendEscapedSubstring(str, i);
lexerState->disableMacroArgs = true;
continue; // Do not copy an additional character
// Regular characters will just get copied
}
append_yylval_string(c);
}
finish:
if (i == sizeof(yylval.string)) {
i--;
warning(WARNING_LONG_STR, "String constant too long\n");
}
yylval.string[i] = '\0';
lexerState->disableMacroArgs = false;
lexerState->disableInterpolation = false;
return i;
}
// Lexer core
static int yylex_SKIP_TO_ENDC(void); // forward declaration for yylex_NORMAL
static int yylex_NORMAL(void)
{
uint32_t num = 0;
for (;;) {
int c = nextChar();
char secondChar;
switch (c) {
// Ignore whitespace and comments
case ';':
discardComment();
// fallthrough
case ' ':
case '\t':
break;
// Handle unambiguous single-char tokens
case '~':
return T_OP_NOT;
case '@':
yylval.symName[0] = '@';
yylval.symName[1] = '\0';
return T_ID;
case '[':
return T_LBRACK;
case ']':
return T_RBRACK;
case '(':
return T_LPAREN;
case ')':
return T_RPAREN;
case ',':
return T_COMMA;
// Handle ambiguous 1- or 2-char tokens
case '+': // Either += or ADD
if (peek() == '=') {
shiftChar();
return T_POP_ADDEQ;
}
return T_OP_ADD;
case '-': // Either -= or SUB
if (peek() == '=') {
shiftChar();
return T_POP_SUBEQ;
}
return T_OP_SUB;
case '*': // Either *=, MUL, or EXP
switch (peek()) {
case '=':
shiftChar();
return T_POP_MULEQ;
case '*':
shiftChar();
return T_OP_EXP;
default:
return T_OP_MUL;
}
case '/': // Either /=, DIV, or a block comment
switch (peek()) {
case '=':
shiftChar();
return T_POP_DIVEQ;
case '*':
shiftChar();
discardBlockComment();
break;
default:
return T_OP_DIV;
}
break;
case '|': // Either |=, binary OR, or logical OR
switch (peek()) {
case '=':
shiftChar();
return T_POP_OREQ;
case '|':
shiftChar();
return T_OP_LOGICOR;
default:
return T_OP_OR;
}
case '^': // Either ^= or XOR
if (peek() == '=') {
shiftChar();
return T_POP_XOREQ;
}
return T_OP_XOR;
case '=': // Either assignment or EQ
if (peek() == '=') {
shiftChar();
return T_OP_LOGICEQU;
}
return T_POP_EQUAL;
case '!': // Either a NEQ or negation
if (peek() == '=') {
shiftChar();
return T_OP_LOGICNE;
}
return T_OP_LOGICNOT;
// Handle ambiguous 1-, 2-, or 3-char tokens
case '<': // Either <<=, LT, LTE, or left shift
switch (peek()) {
case '=':
shiftChar();
return T_OP_LOGICLE;
case '<':
shiftChar();
if (peek() == '=') {
shiftChar();
return T_POP_SHLEQ;
}
return T_OP_SHL;
default:
return T_OP_LOGICLT;
}
case '>': // Either >>=, GT, GTE, or either kind of right shift
switch (peek()) {
case '=':
shiftChar();
return T_OP_LOGICGE;
case '>':
shiftChar();
switch (peek()) {
case '=':
shiftChar();
return T_POP_SHREQ;
case '>':
shiftChar();
return T_OP_USHR;
default:
return T_OP_SHR;
}
default:
return T_OP_LOGICGT;
}
// Handle colon, which may begin an anonymous label ref
case ':':
c = peek();
if (c != '+' && c != '-')
return T_COLON;
readAnonLabelRef(c);
return T_ANON;
// Handle numbers
case '0': // Decimal or fixed-point number
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
num = readNumber(10, c - '0');
if (peek() == '.') {
shiftChar();
yylval.constValue = readFractionalPart(num);
} else {
yylval.constValue = num;
}
return T_NUMBER;
case '&': // Either &=, binary AND, logical AND, or an octal constant
secondChar = peek();
if (secondChar == '=') {
shiftChar();
return T_POP_ANDEQ;
} else if (secondChar == '&') {
shiftChar();
return T_OP_LOGICAND;
} else if (secondChar >= '0' && secondChar <= '7') {
yylval.constValue = readNumber(8, 0);
return T_NUMBER;
}
return T_OP_AND;
case '%': // Either %=, MOD, or a binary constant
secondChar = peek();
if (secondChar == '=') {
shiftChar();
return T_POP_MODEQ;
} else if (secondChar == binDigits[0] || secondChar == binDigits[1]) {
yylval.constValue = readBinaryNumber();
return T_NUMBER;
}
return T_OP_MOD;
case '$': // Hex constant
yylval.constValue = readHexNumber();
return T_NUMBER;
case '`': // Gfx constant
yylval.constValue = readGfxConstant();
return T_NUMBER;
// Handle strings
case '"':
readString();
return T_STRING;
// Handle newlines and EOF
case '\r':
handleCRLF(c);
// fallthrough
case '\n':
return T_NEWLINE;
case EOF:
return T_EOF;
// Handle line continuations
case '\\':
// Macro args were handled by `peek`, and character escapes do not exist
// outside of string literals, so this must be a line continuation.
readLineContinuation();
break;
// Handle identifiers... or report garbage characters
default:
if (startsIdentifier(c)) {
int tokenType = readIdentifier(c);
// An ELIF after a taken IF needs to not evaluate its condition
if (tokenType == T_POP_ELIF && lexerState->lastToken == T_NEWLINE
&& lexer_GetIFDepth() > 0 && lexer_RanIFBlock()
&& !lexer_ReachedELSEBlock())
return yylex_SKIP_TO_ENDC();
// If a keyword, don't try to expand
if (tokenType != T_ID && tokenType != T_LOCAL_ID)
return tokenType;
// Local symbols cannot be string expansions
if (tokenType == T_ID && lexerState->expandStrings) {
// Attempt string expansion
struct Symbol const *sym = sym_FindExactSymbol(yylval.symName);
if (sym && sym->type == SYM_EQUS) {
char const *s = sym_GetStringValue(sym);
assert(s);
if (s[0])
beginExpansion(s, false, sym->name);
continue; // Restart, reading from the new buffer
}
}
if (tokenType == T_ID && (lexerState->atLineStart || peek() == ':'))
return T_LABEL;
return tokenType;
}
// Do not report weird characters when capturing, it'll be done later
if (!lexerState->capturing) {
// TODO: try to group reportings
error("Unknown character %s\n", printChar(c));
}
}
lexerState->atLineStart = false;
}
}
static int yylex_RAW(void)
{
// This is essentially a modified `appendStringLiteral`
size_t parenDepth = 0;
size_t i = 0;
int c;
// Trim left whitespace (stops at a block comment)
for (;;) {
c = peek();
if (isWhitespace(c)) {
shiftChar();
} else if (c == '\\') {
shiftChar();
c = peek();
// If not a line continuation, handle as a normal char
if (!isWhitespace(c) && c != '\n' && c != '\r')
goto backslash;
// Line continuations count as "whitespace"
readLineContinuation();
} else {
break;
}
}
for (;;) {
c = peek();
switch (c) {
case '"': // String literals inside macro args
shiftChar();
i = appendStringLiteral(i);
break;
case ';': // Comments inside macro args
discardComment();
c = peek();
// fallthrough
case '\r': // End of line
case '\n':
case EOF:
goto finish;
case '/': // Block comments inside macro args
shiftChar();
if (peek() == '*') {
shiftChar();
discardBlockComment();
continue;
}
append_yylval_string(c); // Append the slash
break;
case ',': // End of macro arg
if (parenDepth == 0)
goto finish;
goto append;
case '(': // Open parentheses inside macro args
if (parenDepth < UINT_MAX)
parenDepth++;
goto append;
case ')': // Close parentheses inside macro args
if (parenDepth > 0)
parenDepth--;
goto append;
case '\\': // Character escape
shiftChar();
c = peek();
backslash:
switch (c) {
case ',': // Escapes only valid inside a macro arg
case '(':
case ')':
case '\\': // Escapes shared with string literals
case '"':
case '{':
case '}':
break;
case 'n':
c = '\n';
break;
case 'r':
c = '\r';
break;
case 't':
c = '\t';
break;
case ' ':
case '\r':
case '\n':
readLineContinuation();
continue;
case EOF: // Can't really print that one
error("Illegal character escape at end of input\n");
c = '\\';
break;
// Macro args were already handled by peek, so '\@',
// '\#', and '\0'-'\9' should not occur here.
default:
error("Illegal character escape %s\n", printChar(c));
break;
}
// fallthrough
default: // Regular characters will just get copied
append:
append_yylval_string(c);
shiftChar();
break;
}
}
finish:
if (i == sizeof(yylval.string)) {
i--;
warning(WARNING_LONG_STR, "Macro argument too long\n");
}
// Trim right whitespace
while (i && isWhitespace(yylval.string[i - 1]))
i--;
yylval.string[i] = '\0';
// Returning T_COMMAs to the parser would mean that two consecutive commas
// (i.e. an empty argument) need to return two different tokens (T_STRING
// then T_COMMA) without advancing the read. To avoid this, commas in raw
// mode end the current macro argument but are not tokenized themselves.
if (c == ',') {
shiftChar();
return T_STRING;
}
// The last argument may end in a trailing comma, newline, or EOF.
// To allow trailing commas, raw mode will continue after the last
// argument, immediately lexing the newline or EOF again (i.e. with
// an empty raw string before it). This will not be treated as a
// macro argument. To pass an empty last argument, use a second
// trailing comma.
if (i > 0)
return T_STRING;
lexer_SetMode(LEXER_NORMAL);
if (c == '\r' || c == '\n') {
shiftChar();
handleCRLF(c);
return T_NEWLINE;
}
return T_EOF;
}
#undef append_yylval_string
// This function uses the fact that `if`, etc. constructs are only valid when
// there's nothing before them on their lines. This enables filtering
// "meaningful" (= at line start) vs. "meaningless" (everything else) tokens.
// It's especially important due to macro args not being handled in this
// state, and lexing them in "normal" mode potentially producing such tokens.
static int skipIfBlock(bool toEndc)
{
lexer_SetMode(LEXER_NORMAL);
uint32_t startingDepth = lexer_GetIFDepth();
int token;
bool atLineStart = lexerState->atLineStart;
// Prevent expanding macro args and symbol interpolation in this state
lexerState->disableMacroArgs = true;
lexerState->disableInterpolation = true;
for (;;) {
if (atLineStart) {
int c;
for (;; shiftChar()) {
c = peek();
if (!isWhitespace(c))
break;
}
if (startsIdentifier(c)) {
shiftChar();
token = readIdentifier(c);
switch (token) {
case T_POP_IF:
lexer_IncIFDepth();
break;
case T_POP_ELIF:
if (lexer_ReachedELSEBlock())
fatalerror("Found ELIF after an ELSE block\n");
goto maybeFinish;
case T_POP_ELSE:
if (lexer_ReachedELSEBlock())
fatalerror("Found ELSE after an ELSE block\n");
lexer_ReachELSEBlock();
// fallthrough
maybeFinish:
if (toEndc) // Ignore ELIF and ELSE, go to ENDC
break;
// fallthrough
case T_POP_ENDC:
if (lexer_GetIFDepth() == startingDepth)
goto finish;
if (token == T_POP_ENDC)
lexer_DecIFDepth();
}
}
atLineStart = false;
}
// Read chars until EOL
do {
int c = nextChar();
if (c == EOF) {
token = T_EOF;
goto finish;
} else if (c == '\\') {
// Unconditionally skip the next char, including line conts
c = nextChar();
} else if (c == '\r' || c == '\n') {
atLineStart = true;
}
if (c == '\r' || c == '\n') {
// Handle CRLF before nextLine() since shiftChar updates colNo
handleCRLF(c);
// Do this both on line continuations and plain EOLs
nextLine();
}
} while (!atLineStart);
}
finish:
lexerState->disableMacroArgs = false;
lexerState->disableInterpolation = false;
lexerState->atLineStart = false;
return token;
}
static int yylex_SKIP_TO_ELIF(void)
{
return skipIfBlock(false);
}
static int yylex_SKIP_TO_ENDC(void)
{
return skipIfBlock(true);
}
static int yylex_SKIP_TO_ENDR(void)
{
lexer_SetMode(LEXER_NORMAL);
int depth = 1;
bool atLineStart = lexerState->atLineStart;
// Prevent expanding macro args and symbol interpolation in this state
lexerState->disableMacroArgs = true;
lexerState->disableInterpolation = true;
for (;;) {
if (atLineStart) {
int c;
for (;;) {
c = peek();
if (!isWhitespace(c))
break;
shiftChar();
}
if (startsIdentifier(c)) {
shiftChar();
switch (readIdentifier(c)) {
case T_POP_FOR:
case T_POP_REPT:
depth++;
break;
case T_POP_ENDR:
depth--;
if (!depth)
goto finish;
break;
case T_POP_IF:
lexer_IncIFDepth();
break;
case T_POP_ENDC:
lexer_DecIFDepth();
}
}
atLineStart = false;
}
// Read chars until EOL
do {
int c = nextChar();
if (c == EOF) {
goto finish;
} else if (c == '\\') {
// Unconditionally skip the next char, including line conts
c = nextChar();
} else if (c == '\r' || c == '\n') {
atLineStart = true;
}
if (c == '\r' || c == '\n') {
// Handle CRLF before nextLine() since shiftChar updates colNo
handleCRLF(c);
// Do this both on line continuations and plain EOLs
nextLine();
}
} while (!atLineStart);
}
finish:
lexerState->disableMacroArgs = false;
lexerState->disableInterpolation = false;
lexerState->atLineStart = false;
// yywrap() will finish the REPT/FOR loop
return T_EOF;
}
int yylex(void)
{
if (lexerState->atLineStart && lexerStateEOL) {
lexer_SetState(lexerStateEOL);
lexerStateEOL = NULL;
}
// `lexer_SetState` updates `lexerState`, so check for EOF after it
if (lexerState->lastToken == T_EOB && yywrap())
return T_EOF;
// Newlines read within an expansion should not increase the line count
if (lexerState->atLineStart && !lexerState->expansions)
nextLine();
static int (* const lexerModeFuncs[])(void) = {
[LEXER_NORMAL] = yylex_NORMAL,
[LEXER_RAW] = yylex_RAW,
[LEXER_SKIP_TO_ELIF] = yylex_SKIP_TO_ELIF,
[LEXER_SKIP_TO_ENDC] = yylex_SKIP_TO_ENDC,
[LEXER_SKIP_TO_ENDR] = yylex_SKIP_TO_ENDR,
};
int token = lexerModeFuncs[lexerState->mode]();
// Captures end at their buffer's boundary no matter what
if (token == T_EOF && !lexerState->capturing)
token = T_EOB;
lexerState->lastToken = token;
lexerState->atLineStart = token == T_NEWLINE || token == T_EOB;
return token;
}
static void startCapture(struct CaptureBody *capture)
{
assert(!lexerState->capturing);
lexerState->capturing = true;
lexerState->captureSize = 0;
lexerState->disableMacroArgs = true;
lexerState->disableInterpolation = true;
capture->lineNo = lexer_GetLineNo();
if (lexerState->isMmapped && !lexerState->expansions) {
capture->body = &lexerState->ptr[lexerState->offset];
} else {
lexerState->captureCapacity = 128; // The initial size will be twice that
assert(lexerState->captureBuf == NULL);
reallocCaptureBuf();
capture->body = NULL; // Indicate to retrieve the capture buffer when done capturing
}
}
static void endCapture(struct CaptureBody *capture)
{
// This being NULL means we're capturing from the capture buf, which is `realloc`'d during
// the whole capture process, and so MUST be retrieved at the end
if (!capture->body)
capture->body = lexerState->captureBuf;
capture->size = lexerState->captureSize;
lexerState->capturing = false;
lexerState->captureBuf = NULL;
lexerState->disableMacroArgs = false;
lexerState->disableInterpolation = false;
}
bool lexer_CaptureRept(struct CaptureBody *capture)
{
startCapture(capture);
size_t depth = 0;
int c = EOF;
// Due to parser internals, it reads the EOL after the expression before calling this.
// Thus, we don't need to keep one in the buffer afterwards.
// The following assertion checks that.
assert(lexerState->atLineStart);
for (;;) {
nextLine();
// We're at line start, so attempt to match a `REPT` or `ENDR` token
do { // Discard initial whitespace
c = nextChar();
} while (isWhitespace(c));
// Now, try to match `REPT`, `FOR` or `ENDR` as a **whole** identifier
if (startsIdentifier(c)) {
switch (readIdentifier(c)) {
case T_POP_REPT:
case T_POP_FOR:
depth++;
// Ignore the rest of that line
break;
case T_POP_ENDR:
if (!depth) {
// The final ENDR has been captured, but we don't want it!
// We know we have read exactly "ENDR", not e.g. an EQUS
lexerState->captureSize -= strlen("ENDR");
goto finish;
}
depth--;
}
}
// Just consume characters until EOL or EOF
for (;; c = nextChar()) {
if (c == EOF) {
error("Unterminated REPT/FOR block\n");
goto finish;
} else if (c == '\n' || c == '\r') {
handleCRLF(c);
break;
}
}
}
finish:
endCapture(capture);
// ENDR or EOF puts us past the start of the line
lexerState->atLineStart = false;
// Returns true if an ENDR terminated the block, false if it reached EOF first
return c != EOF;
}
bool lexer_CaptureMacroBody(struct CaptureBody *capture)
{
startCapture(capture);
// If the file is `mmap`ed, we need not to unmap it to keep access to the macro
if (lexerState->isMmapped)
lexerState->isReferenced = true;
int c = EOF;
// Due to parser internals, it reads the EOL after the expression before calling this.
// Thus, we don't need to keep one in the buffer afterwards.
// The following assertion checks that.
assert(lexerState->atLineStart);
for (;;) {
nextLine();
// We're at line start, so attempt to match an `ENDM` token
do { // Discard initial whitespace
c = nextChar();
} while (isWhitespace(c));
// Now, try to match `ENDM` as a **whole** identifier
if (startsIdentifier(c)) {
switch (readIdentifier(c)) {
case T_POP_ENDM:
// The ENDM has been captured, but we don't want it!
// We know we have read exactly "ENDM", not e.g. an EQUS
lexerState->captureSize -= strlen("ENDM");
goto finish;
}
}
// Just consume characters until EOL or EOF
for (;; c = nextChar()) {
if (c == EOF) {
error("Unterminated macro definition\n");
goto finish;
} else if (c == '\n' || c == '\r') {
handleCRLF(c);
break;
}
}
}
finish:
endCapture(capture);
// ENDM or EOF puts us past the start of the line
lexerState->atLineStart = false;
// Returns true if an ENDM terminated the block, false if it reached EOF first
return c != EOF;
}