ref: 2346ea488600e7e735c2275e5bcd310bbbf9810c
dir: /sys/src/cmd/deroff.c/
#include <u.h>
#include <libc.h>
#include <bio.h>
/*
 * Deroff command -- strip troff, eqn, and tbl sequences from
 * a file.  Has three flags argument, -w, to cause output one word per line
 * rather than in the original format.
 * -mm (or -ms) causes the corresponding macro's to be interpreted
 * so that just sentences are output
 * -ml  also gets rid of lists.
 * -i causes deroff to ignore .so and .nx commands.
 * Deroff follows .so and .nx commands, removes contents of macro
 * definitions, equations (both .EQ ... .EN and $...$),
 * Tbl command sequences, and Troff backslash vconstructions.
 * 
 * All input is through the C macro; the most recently read character is in c.
 */
/*
#define	C	((c = Bgetrune(infile)) < 0?\
			eof():\
			((c == ldelim) && (filesp == files)?\
				skeqn():\
				(c == '\n'?\
					(linect++,c):\
						c)))
#define	C1	((c = Bgetrune(infile)) == Beof?\
			eof():\
			(c == '\n'?\
				(linect++,c):\
				c))
*/
/* lose those macros! */
#define	C	fC()
#define	C1	fC1()
#define	SKIP	while(C != '\n') 
#define SKIP1	while(C1 != '\n')
#define SKIP_TO_COM		SKIP;\
				SKIP;\
				pc=c;\
				while(C != '.' || pc != '\n' || C > 'Z')\
						pc=c
#define YES		1
#define NO		0
#define MS		0
#define MM		1
#define ONE		1
#define TWO		2
#define NOCHAR		-2
#define	EXTENDED	-1		/* All runes above 0x7F */
#define SPECIAL		0
#define APOS		1
#define PUNCT		2
#define DIGIT		3
#define LETTER		4
int	linect	= 0;
int	wordflag= NO;
int	underscoreflag = NO;
int	msflag	= NO;
int	iflag	= NO;
int	mac	= MM;
int	disp	= 0;
int	inmacro	= NO;
int	intable	= NO;
int	eqnflag	= 0;
#define	MAX_ASCII	0X80
char	chars[MAX_ASCII];	/* SPECIAL, PUNCT, APOS, DIGIT, or LETTER */
Rune	line[30000];
Rune*	lp;
long	c;
long	pc;
int	ldelim	= NOCHAR;
int	rdelim	= NOCHAR;
char**	argv;
char	fname[50];
Biobuf*	files[15];
Biobuf**filesp;
Biobuf*	infile;
char*	devnull	= "/dev/null";
Biobuf	*infile;
Biobuf	bout;
long	skeqn(void);
Biobuf*	opn(char *p);
int	eof(void);
int	charclass(int);
void	getfname(void);
void	fatal(char *s, char *p);
void	usage(void);
void	work(void);
void	putmac(Rune *rp, int vconst);
void	regline(int macline, int vconst);
void	putwords(void);
void	comline(void);
void	macro(void);
void	eqn(void);
void	tbl(void);
void	stbl(void);
void	sdis(char a1, char a2);
void	sce(void);
void	backsl(void);
char*	copys(char *s);
void	refer(int c1);
void	inpic(void);
int
fC(void)
{
	c = Bgetrune(infile);
	if(c < 0)
		return eof();
	if(c == ldelim && filesp == files)
		return skeqn();
	if(c == '\n')
		linect++;
	return c;
}
int
fC1(void)
{
	c = Bgetrune(infile);
	if(c == Beof)
		return eof();
	if(c == '\n')
		linect++;
	return c;
}
void
main(int argc, char *av[])
{
	int i;
	char *f;
	argv = av;
	Binit(&bout, 1, OWRITE);
	ARGBEGIN{
	case 'w':
		wordflag = YES;
		break;
	case '_':
		wordflag = YES;
		underscoreflag = YES;
		break;
	case 'm':
		msflag = YES;
		if(f = ARGF())
			switch(*f)
			{
			case 'm':	mac = MM; break;
			case 's':	mac = MS; break;
			case 'l':	disp = 1; break;
			default:	usage();
			}
		else
			usage();
		break;
	case 'i':
		iflag = YES;
		break;
	default:
		usage();
	}ARGEND
	if(*argv)
		infile = opn(*argv++);
	else{
		infile = malloc(sizeof(Biobuf));
		Binit(infile, 0, OREAD);
	}
	files[0] = infile;
	filesp = &files[0];
	for(i='a'; i<='z' ; ++i)
		chars[i] = LETTER;
	for(i='A'; i<='Z'; ++i)
		chars[i] = LETTER;
	for(i='0'; i<='9'; ++i)
		chars[i] = DIGIT;
	chars['\''] = APOS;
	chars['&'] = APOS;
	chars['\b'] = APOS;
	chars['.'] = PUNCT;
	chars[','] = PUNCT;
	chars[';'] = PUNCT;
	chars['?'] = PUNCT;
	chars[':'] = PUNCT;
	work();
}
long
skeqn(void)
{
	while(C1 != rdelim)
		if(c == '\\')
			c = C1;
		else if(c == '"')
			while(C1 != '"')
				if(c == '\\') 
					C1;
	if (msflag)
		eqnflag = 1;
	return(c = ' ');
}
Biobuf*
opn(char *p)
{
	Biobuf *fd;
	while ((fd = Bopen(p, OREAD)) == 0) {
		if(msflag || p == devnull)
			fatal("Cannot open file %s - quitting\n", p);
		else {
			fprint(2, "Deroff: Cannot open file %s - continuing\n", p);
			p = devnull;
		}
	}
	linect = 0;
	return(fd);
}
int
eof(void)
{
	if(Bfildes(infile) != 0)
		Bterm(infile);
	if(filesp > files)
		infile = *--filesp;
	else
	if(*argv)
		infile = opn(*argv++);
	else
		exits(0);
	return(C);
}
void
getfname(void)
{
	char *p;
	Rune r;
	Dir *dir;
	struct chain
	{ 
		struct	chain*	nextp; 
		char*	datap; 
	} *q;
	static struct chain *namechain= 0;
	while(C == ' ')
		;
	for(p = fname; (r=c) != '\n' && r != ' ' && r != '\t' && r != '\\'; C)
		p += runetochar(p, &r);
	*p = '\0';
	while(c != '\n')
		C;
	if(!strcmp(fname, "/sys/lib/tmac/tmac.cs")
			|| !strcmp(fname, "/sys/lib/tmac/tmac.s")) {
		fname[0] = '\0';
		return;
	}
	dir = dirstat(fname);
	if(dir!=nil && ((dir->mode & DMDIR) || dir->type != 'M')) {
		free(dir);
		fname[0] = '\0';
		return;
	}
	free(dir);
	/*
	 * see if this name has already been used
	 */
	for(q = namechain; q; q = q->nextp)
		if( !strcmp(fname, q->datap)) {
			fname[0] = '\0';
			return;
		}
	q = (struct chain*)malloc(sizeof(struct chain));
	q->nextp = namechain;
	q->datap = copys(fname);
	namechain = q;
}
void
usage(void)
{
	fprint(2,"usage: deroff [-nw_pi] [-m (m s l)] [file ...] \n");
	exits("usage");
}
void
fatal(char *s, char *p)
{
	fprint(2, "deroff: ");
	fprint(2, s, p);
	exits(s);
}
void
work(void)
{
	for(;;) {
		eqnflag = 0;
		if(C == '.'  ||  c == '\'')
			comline();
		else
			regline(NO, TWO);
	}
}
void
regline(int macline, int vconst)
{
	line[0] = c;
	lp = line;
	for(;;) {
		if(c == '\\') {
			*lp = ' ';
			backsl();
			if(c == '%')	/* no blank for hyphenation char */
				lp--;
		}
		if(c == '\n')
			break;
		if(intable && c=='T') {
			*++lp = C;
			if(c=='{' || c=='}') {
				lp[-1] = ' ';
				*lp = C;
			}
		} else {
			if(msflag == 1 && eqnflag == 1) {
				eqnflag = 0;
				*++lp = 'x';
			}
			*++lp = C;
		}
	}
	*lp = '\0';
	if(lp != line) {
		if(wordflag)
			putwords();
		else
		if(macline)
			putmac(line,vconst);
		else
			Bprint(&bout, "%S\n", line);
	}
}
void
putmac(Rune *rp, int vconst)
{
	Rune *t;
	int found;
	Rune last;
	found = 0;
	last = 0;
	while(*rp) {
		while(*rp == ' ' || *rp == '\t')
			Bputrune(&bout, *rp++);
		for(t = rp; *t != ' ' && *t != '\t' && *t != '\0'; t++)
			;
		if(*rp == '\"')
			rp++;
		if(t > rp+vconst && charclass(*rp) == LETTER
				&& charclass(rp[1]) == LETTER) {
			while(rp < t)
				if(*rp == '\"')
					rp++;
				else
					Bputrune(&bout, *rp++);
			last = t[-1];
			found++;
		} else
		if(found && charclass(*rp) == PUNCT && rp[1] == '\0')
			Bputrune(&bout, *rp++);
		else {
			last = t[-1];
			rp = t;
		}
	}
	Bputc(&bout, '\n');
	if(msflag && charclass(last) == PUNCT)
		Bprint(&bout, " %C\n", last);
}
/*
 * break into words for -w option
 */
void
putwords(void)
{
	Rune *p, *p1;
	int i, nlet;
	for(p1 = line;;) {
		/*
		 * skip initial specials ampersands and apostrophes
		 */
		while((i = charclass(*p1)) != EXTENDED && i < DIGIT)
			if(*p1++ == '\0')
				return;
		nlet = 0;
		for(p = p1; (i = charclass(*p)) != SPECIAL || (underscoreflag && *p=='_'); p++)
			if(i == LETTER || (underscoreflag && *p == '_'))
				nlet++;
		/*
		 * MDM definition of word
		 */
		if(nlet > 1) {
			/*
			 * delete trailing ampersands and apostrophes
			 */
			while(*--p == '\'' || *p == '&'
					   || charclass(*p) == PUNCT)
				;
			while(p1 <= p)
				Bputrune(&bout, *p1++);
			Bputc(&bout, '\n');
		} else
			p1 = p;
	}
}
void
comline(void)
{
	long c1, c2;
	while(C==' ' || c=='\t')
		;
comx:
	if((c1=c) == '\n')
		return;
	c2 = C;
	if(c1=='.' && c2!='.')
		inmacro = NO;
	if(msflag && c1 == '['){
		refer(c2);
		return;
	}
	if(c2 == '\n')
		return;
	if(c1 == '\\' && c2 == '\"')
		SKIP;
	else
	if (filesp==files && c1=='E' && c2=='Q')
			eqn();
	else
	if(filesp==files && c1=='T' && (c2=='S' || c2=='C' || c2=='&')) {
		if(msflag)
			stbl(); 
		else
			tbl();
	}
	else
	if(c1=='T' && c2=='E')
		intable = NO;
	else if (!inmacro &&
			((c1 == 'd' && c2 == 'e') ||
		   	 (c1 == 'i' && c2 == 'g') ||
		   	 (c1 == 'a' && c2 == 'm')))
				macro();
	else
	if(c1=='s' && c2=='o') {
		if(iflag)
			SKIP;
		else {
			getfname();
			if(fname[0]) {
				if(infile = opn(fname))
					*++filesp = infile;
				else infile = *filesp;
			}
		}
	}
	else
	if(c1=='n' && c2=='x')
		if(iflag)
			SKIP;
		else {
			getfname();
			if(fname[0] == '\0')
				exits(0);
			if(Bfildes(infile) != 0)
				Bterm(infile);
			infile = *filesp = opn(fname);
		}
	else
	if(c1 == 't' && c2 == 'm')
		SKIP;
	else
	if(c1=='h' && c2=='w')
		SKIP; 
	else
	if(msflag && c1 == 'T' && c2 == 'L') {
		SKIP_TO_COM;
		goto comx; 
	}
	else
	if(msflag && c1=='N' && c2 == 'R')
		SKIP;
	else
	if(msflag && c1 == 'A' && (c2 == 'U' || c2 == 'I')){
		if(mac==MM)SKIP;
		else {
			SKIP_TO_COM;
			goto comx; 
		}
	} else
	if(msflag && c1=='F' && c2=='S') {
		SKIP_TO_COM;
		goto comx; 
	}
	else
	if(msflag && (c1=='S' || c1=='N') && c2=='H') {
		SKIP_TO_COM;
		goto comx; 
	} else
	if(c1 == 'U' && c2 == 'X') {
		if(wordflag)
			Bprint(&bout, "UNIX\n");
		else
			Bprint(&bout, "UNIX ");
	} else
	if(msflag && c1=='O' && c2=='K') {
		SKIP_TO_COM;
		goto comx; 
	} else
	if(msflag && c1=='N' && c2=='D')
		SKIP;
	else
	if(msflag && mac==MM && c1=='H' && (c2==' '||c2=='U'))
		SKIP;
	else
	if(msflag && mac==MM && c2=='L') {
		if(disp || c1=='R')
			sdis('L', 'E');
		else {
			SKIP;
			Bprint(&bout, " .");
		}
	} else
	if(!msflag && c1=='P' && c2=='S') {
		inpic();
	} else
	if(msflag && (c1=='D' || c1=='N' || c1=='K'|| c1=='P') && c2=='S') { 
		sdis(c1, 'E'); 
	} else
	if(msflag && (c1 == 'K' && c2 == 'F')) { 
		sdis(c1,'E'); 
	} else
	if(msflag && c1=='n' && c2=='f')
		sdis('f','i');
	else
	if(msflag && c1=='c' && c2=='e')
		sce();
	else {
		if(c1=='.' && c2=='.') {
			if(msflag) {
				SKIP;
				return;
			}
			while(C == '.')
				;
		}
		inmacro++;
		if(c1 <= 'Z' && msflag)
			regline(YES,ONE);
		else {
			if(wordflag)
				C;
			regline(YES,TWO);
		}
		inmacro--;
	}
}
void
macro(void)
{
	if(msflag) {
		do { 
			SKIP1; 
		} while(C1 != '.' || C1 != '.' || C1 == '.');
		if(c != '\n')
			SKIP;
		return;
	}
	SKIP;
	inmacro = YES;
}
void
sdis(char a1, char a2)
{
	int c1, c2;
	int eqnf;
	int lct;
	if(a1 == 'P'){
		while(C1 == ' ')
			;
		if(c == '<') {
			SKIP1;
			return;
		}
	}
	lct = 0;
	eqnf = 1;
	if(c != '\n')
		SKIP1;
	for(;;) {
		while(C1 != '.')
			if(c == '\n')
				continue;
			else
				SKIP1;
		if((c1=C1) == '\n')
			continue;
		if((c2=C1) == '\n') {
			if(a1 == 'f' && (c1 == 'P' || c1 == 'H'))
				return;
			continue;
		}
		if(c1==a1 && c2 == a2) {
			SKIP1;
			if(lct != 0){
				lct--;
				continue;
			}
			if(eqnf)
				Bprint(&bout, " .");
			Bputc(&bout, '\n');
			return;
		} else
		if(a1 == 'L' && c2 == 'L') {
			lct++;
			SKIP1;
		} else
		if(a1 == 'D' && c1 == 'E' && c2 == 'Q') {
			eqn(); 
			eqnf = 0;
		} else
		if(a1 == 'f') {
			if((mac == MS && c2 == 'P') ||
				(mac == MM && c1 == 'H' && c2 == 'U')){
				SKIP1;
				return;
			}
			SKIP1;
		}
		else
			SKIP1;
	}
}
void
tbl(void)
{
	while(C != '.')
		;
	SKIP;
	intable = YES;
}
void
stbl(void)
{
	while(C != '.')
		;
	SKIP_TO_COM;
	if(c != 'T' || C != 'E') {
		SKIP;
		pc = c;
		while(C != '.' || pc != '\n' || C != 'T' || C != 'E')
			pc = c;
	}
}
void
eqn(void)
{
	long c1, c2;
	int dflg;
	char last;
	last = 0;
	dflg = 1;
	SKIP;
	for(;;) {
		if(C1 == '.'  || c == '\'') {
			while(C1==' ' || c=='\t')
				;
			if(c=='E' && C1=='N') {
				SKIP;
				if(msflag && dflg) {
					Bputc(&bout, 'x');
					Bputc(&bout, ' ');
					if(last) {
						Bputc(&bout, last); 
						Bputc(&bout, '\n'); 
					}
				}
				return;
			}
		} else
		if(c == 'd') {
			if(C1=='e' && C1=='l')
				if(C1=='i' && C1=='m') {
					while(C1 == ' ')
						;
					if((c1=c)=='\n' || (c2=C1)=='\n' ||
					  (c1=='o' && c2=='f' && C1=='f')) {
						ldelim = NOCHAR;
						rdelim = NOCHAR;
					} else {
						ldelim = c1;
						rdelim = c2;
					}
				}
			dflg = 0;
		}
		if(c != '\n')
			while(C1 != '\n') { 
				if(charclass(c) == PUNCT)
					last = c;
				else
				if(c != ' ')
					last = 0;
			}
	}
}
/*
 * skip over a complete backslash vconstruction
 */
void
backsl(void)
{
	int bdelim;
sw:  
	switch(C1)
	{
	case '"':
		SKIP1;
		return;
	case 's':
		if(C1 == '\\')
			backsl();
		else {
			while(C1>='0' && c<='9')
				;
			Bungetrune(infile);
			c = '0';
		}
		lp--;
		return;
	case 'f':
	case 'n':
	case '*':
		if(C1 != '(')
			return;
	case '(':
		if(msflag) {
			if(C == 'e') {
				if(C1 == 'm') {
					*lp = '-';
					return;
				}
			} else
			if(c != '\n')
				C1;
			return;
		}
		if(C1 != '\n')
			C1;
		return;
	case '$':
		C1;	/* discard argument number */
		return;
	case 'b':
	case 'x':
	case 'v':
	case 'h':
	case 'w':
	case 'o':
	case 'l':
	case 'L':
		if((bdelim=C1) == '\n')
			return;
		while(C1!='\n' && c!=bdelim)
			if(c == '\\')
				backsl();
		return;
	case '\\':
		if(inmacro)
			goto sw;
	default:
		return;
	}
}
char*
copys(char *s)
{
	char *t, *t0;
	if((t0 = t = malloc((strlen(s)+1))) == 0)
		fatal("Cannot allocate memory", (char*)0);
	while(*t++ = *s++)
		;
	return(t0);
}
void
sce(void)
{
	int n = 1;
	while (C != L'\n' && !(L'0' <= c && c <= L'9'))
		;
	if (c != L'\n') {
		for (n = c-L'0';'0' <= C && c <= L'9';)
			n = n*10 + c-L'0';
	}
	while(n) {
		if(C == '.') {
			if(C == 'c') {
				if(C == 'e') {
					while(C == ' ')
						;
					if(c == '0') {
						SKIP;
						break;
					} else
						SKIP;
				} else
					SKIP;
			} else
			if(c == 'P' || C == 'P') {
				if(c != '\n')
					SKIP;
				break;
			} else
				if(c != '\n')
					SKIP;
		} else {
			SKIP;
			n--;
		}
	}
}
void
refer(int c1)
{
	int c2;
	if(c1 != '\n')
		SKIP;
	c2 = 0;
	for(;;) {
		if(C != '.')
			SKIP;
		else {
			if(C != ']')
				SKIP;
			else {
				while(C != '\n')
					c2 = c;
				if(charclass(c2) == PUNCT)
					Bprint(&bout, " %C",c2);
				return;
			}
		}
	}
}
void
inpic(void)
{
	int c1;
	Rune *p1;
/*	SKIP1;*/
	while(C1 != '\n')
		if(c == '<'){
			SKIP1;
			return;
		}
	p1 = line;
	c = '\n';
	for(;;) {
		c1 = c;
		if(C1 == '.' && c1 == '\n') {
			if(C1 != 'P' || C1 != 'E') {
				if(c != '\n'){
					SKIP1;
					c = '\n';
				}
				continue;
			}
			SKIP1;
			return;
		} else
		if(c == '\"') {
			while(C1 != '\"') {
				if(c == '\\') {
					if(C1 == '\"')
						continue;
					Bungetrune(infile);
					backsl();
				} else
					*p1++ = c;
			}
			*p1++ = ' ';
		} else
		if(c == '\n' && p1 != line) {
			*p1 = '\0';
			if(wordflag)
				putwords();
			else
				Bprint(&bout, "%S\n\n", line);
			p1 = line;
		}
	}
}
int
charclass(int c)
{
	if(c < MAX_ASCII)
		return chars[c];
	switch(c){
	case 0x2013: case 0x2014:	/* en dash, em dash */
		return SPECIAL;
	}
	return EXTENDED;
}