shithub: rgbds

Download patch

ref: c10345f26d2aa370bc35d73ac5c81ea85beb788e
parent: 6fd5c94b27aff9adc2d2fb9428177605f2d3f552
author: Eldred Habert <eldredhabert0@gmail.com>
date: Sat Oct 1 14:35:00 EDT 2022

Comply with sym file spec (#1078)

Co-authored-by: Rangi <35663410+Rangi42@users.noreply.github.com>

--- a/Makefile
+++ b/Makefile
@@ -97,6 +97,7 @@
 	src/link/section.o \
 	src/link/symbol.o \
 	src/extern/getopt.o \
+	src/extern/utf8decoder.o \
 	src/error.o \
 	src/hashmap.o \
 	src/linkdefs.o \
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -84,6 +84,7 @@
     "link/sdas_obj.c"
     "link/section.c"
     "link/symbol.c"
+    "extern/utf8decoder.c"
     "hashmap.c"
     "linkdefs.c"
     "opmath.c"
--- a/src/link/output.c
+++ b/src/link/output.c
@@ -17,6 +17,8 @@
 #include "link/section.h"
 #include "link/symbol.h"
 
+#include "extern/utf8decoder.h"
+
 #include "error.h"
 #include "linkdefs.h"
 #include "platform.h" // MIN_NB_ELMS
@@ -273,6 +275,55 @@
 	return (*s1)->section->org < (*s2)->section->org ? s1 : s2;
 }
 
+// Checks whether this character is legal as the first character of a symbol's name in a sym file
+static bool canStartSymName(char c)
+{
+	return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_';
+}
+
+// Checks whether this character is legal in a symbol's name in a sym file
+static bool isLegalForSymName(char c)
+{
+	return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') ||
+	       c == '_' || c == '@' || c == '#' || c == '$' || c == '.';
+}
+
+// Prints a symbol's name to `symFile`, assuming that the first character is legal.
+// Illegal characters are UTF-8-decoded (errors are replaced by U+FFFD) and emitted as `\u`/`\U`.
+static void printSymName(char const *name)
+{
+	for (char const *ptr = name; *ptr != '\0'; ) {
+		char c = *ptr;
+
+		if (isLegalForSymName(c)) {
+			// Output legal ASCII characters as-is
+			fputc(c, symFile);
+			++ptr;
+		} else {
+			// Output illegal characters using Unicode escapes
+			// Decode the UTF-8 codepoint; or at least attempt to
+			uint32_t state = 0, codepoint;
+
+			do {
+				decode(&state, &codepoint, *ptr);
+				if (state == 1) {
+					// This sequence was invalid; emit a U+FFFD, and recover
+					codepoint = 0xFFFD;
+					// Skip continuation bytes
+					// A NUL byte does not qualify, so we're good
+					while ((*ptr & 0xC0) == 0x80)
+						++ptr;
+					break;
+				}
+				++ptr;
+			} while (state != 0);
+
+			fprintf(symFile, codepoint <= 0xFFFF ? "\\u%04" PRIx32 : "\\U%08" PRIx32,
+				codepoint);
+		}
+	}
+}
+
 // Comparator function for `qsort` to sort symbols
 // Symbols are ordered by address, or else by original index for a stable sort
 static int compareSymbols(void const *a, void const *b)
@@ -296,16 +347,22 @@
 	if (!symFile)
 		return;
 
+#define forEachSortedSection(sect, ...) do { \
+	for (struct SortedSection const *ssp = bankSections->zeroLenSections; ssp; ssp = ssp->next) { \
+		for (struct Section const *sect = ssp->section; sect; sect = sect->nextu) \
+			__VA_ARGS__ \
+	} \
+	for (struct SortedSection const *ssp = bankSections->sections; ssp; ssp = ssp->next) { \
+		for (struct Section const *sect = ssp->section; sect; sect = sect->nextu) \
+			__VA_ARGS__ \
+	} \
+} while (0)
+
 	uint32_t nbSymbols = 0;
 
-	for (struct SortedSection const *ptr = bankSections->zeroLenSections; ptr; ptr = ptr->next) {
-		for (struct Section const *sect = ptr->section; sect; sect = sect->nextu)
-			nbSymbols += sect->nbSymbols;
-	}
-	for (struct SortedSection const *ptr = bankSections->sections; ptr; ptr = ptr->next) {
-		for (struct Section const *sect = ptr->section; sect; sect = sect->nextu)
-			nbSymbols += sect->nbSymbols;
-	}
+	forEachSortedSection(sect, {
+		nbSymbols += sect->nbSymbols;
+	});
 
 	if (!nbSymbols)
 		return;
@@ -315,30 +372,22 @@
 	if (!symList)
 		err("Failed to allocate symbol list");
 
-	uint32_t idx = 0;
+	nbSymbols = 0;
 
-	for (struct SortedSection const *ptr = bankSections->zeroLenSections; ptr; ptr = ptr->next) {
-		for (struct Section const *sect = ptr->section; sect; sect = sect->nextu) {
-			for (uint32_t i = 0; i < sect->nbSymbols; i++) {
-				symList[idx].idx = idx;
-				symList[idx].sym = sect->symbols[i];
-				symList[idx].addr = symList[idx].sym->offset + sect->org;
-				idx++;
-			}
+	forEachSortedSection(sect, {
+		for (uint32_t i = 0; i < sect->nbSymbols; i++) {
+			if (!canStartSymName(sect->symbols[i]->name[0]))
+				// Don't output symbols that begin with an illegal character
+				continue;
+			symList[nbSymbols].idx = nbSymbols;
+			symList[nbSymbols].sym = sect->symbols[i];
+			symList[nbSymbols].addr = symList[nbSymbols].sym->offset + sect->org;
+			nbSymbols++;
 		}
-	}
-	for (struct SortedSection const *ptr = bankSections->sections; ptr; ptr = ptr->next) {
-		for (struct Section const *sect = ptr->section; sect; sect = sect->nextu) {
-			for (uint32_t i = 0; i < sect->nbSymbols; i++) {
-				symList[idx].idx = idx;
-				symList[idx].sym = sect->symbols[i];
-				symList[idx].addr = symList[idx].sym->offset + sect->org;
-				idx++;
-			}
-		}
-	}
-	assert(idx == nbSymbols);
+	});
 
+#undef forEachSortedSection
+
 	qsort(symList, nbSymbols, sizeof(*symList), compareSymbols);
 
 	uint32_t symBank = bank + sectionTypeInfo[type].firstBank;
@@ -346,11 +395,13 @@
 	for (uint32_t i = 0; i < nbSymbols; i++) {
 		struct SortedSymbol *sym = &symList[i];
 
-		fprintf(symFile, "%02" PRIx32 ":%04" PRIx16 " %s\n",
-			symBank, sym->addr, sym->sym->name);
+		fprintf(symFile, "%02" PRIx32 ":%04" PRIx16 " ", symBank, sym->addr);
+		printSymName(sym->sym->name);
+		fputc('\n', symFile);
 	}
 
 	free(symList);
+
 }
 
 /*