shithub: riscv

ref: dde3628d54073e0569bf565e6ec861e42cfa35f6
dir: /sys/src/cmd/ndb/dnresolve.c/

View raw version
/*
 * domain name resolvers, see rfcs 1035 and 1123
 */
#include <u.h>
#include <libc.h>
#include <ip.h>
#include <bio.h>
#include <mp.h>
#include <libsec.h>
#include <ndb.h>
#include "dns.h"

typedef struct Dest Dest;
typedef struct Query Query;

enum
{
	Udp, Tcp,

	Answerr=	-1,
	Answnone,

	Maxdest=	32,	/* maximum destinations for a request message */

	/*
	 * these are the old values; we're trying longer timeouts now
	 * primarily for the benefit of remote nameservers querying us
	 * during times of bad connectivity.
	 */
	Maxtrans=	5,	/* maximum transmissions to a server */
	Maxretries=	10,	/* cname+actual resends: was 32; have pity on user */

	Maxtcpfree=	4,	/* maximum number of tcp connections kept alive */
	Maxtcpresuetm=	5000,	/* connection resue timeout */

};

struct Dest
{
	uchar	a[IPaddrlen];	/* ip address */
	DN	*s;		/* name server name */
	RR	*n;		/* name server rr */
	int	nx;		/* number of transmissions */
	int	code;		/* response code; used to clear dp->respcode */
};

struct Query {
	DN	*dp;		/* domain */
	ushort	type;		/* and type to look up */
	Request *req;
	Query	*prev;		/* previous query */

	int	depth;
	ushort	id;		/* request id */

	RR	*nsrp;		/* name servers to consult */
};

static RR*	dnresolve1(char*, int, int, Request*, int, int);
static int	netquery(Query *);

/*
 * reading /proc/pid/args yields either "name args" or "name [display args]",
 * so return only display args, if any.
 */
static char *
procgetname(void)
{
	int fd, n;
	char *lp, *rp;
	char buf[256];

	snprint(buf, sizeof buf, "/proc/%d/args", getpid());
	if((fd = open(buf, OREAD|OCEXEC)) < 0)
		return strdup("");
	*buf = '\0';
	n = read(fd, buf, sizeof buf-1);
	close(fd);
	if (n >= 0)
		buf[n] = '\0';
	if ((lp = strchr(buf, '[')) == nil ||
	    (rp = strrchr(buf, ']')) == nil)
		return strdup("");
	*rp = '\0';
	return strdup(lp+1);
}

static void
rrfreelistptr(RR **rpp)
{
	RR *rp;

	if (rpp == nil || *rpp == nil)
		return;
	rp = *rpp;
	*rpp = nil;	/* update pointer in memory before freeing list */
	rrfreelist(rp);
}

/*
 *  lookup 'type' info for domain name 'name'.  If it doesn't exist, try
 *  looking it up as a canonical name.
 *
 *  this process can be quite slow if time-outs are set too high when querying
 *  nameservers that just don't respond to certain query types.  in that case,
 *  there will be multiple udp retries, multiple nameservers will be queried,
 *  and this will be repeated for a cname query.  the whole thing will be
 *  retried several times until we get an answer or a time-out.
 */
RR*
dnresolve(char *name, int class, int type, Request *req, RR **cn, int depth,
	int recurse, int rooted, int *rcode)
{
	RR *rp, *nrp, *drp;
	DN *dp;
	int loops;
	char *procname;
	char nname[Domlen];

	if(rcode)
		*rcode = Rok;

	if(depth > 12)			/* in a recursive loop? */
		return nil;

	procname = procgetname();
	/*
	 *  hack for systems that don't have resolve search
	 *  lists.  Just look up the simple name in the database.
	 */
	if(!rooted && strchr(name, '.') == nil){
		rp = nil;
		drp = domainlist(class);
		for(nrp = drp; rp == nil && nrp != nil; nrp = nrp->next){
			snprint(nname, sizeof nname, "%s.%s", name,
				nrp->ptr->name);
			rp = dnresolve(nname, class, type, req, cn, depth+1,
				recurse, rooted, rcode);
			rrfreelist(rrremneg(&rp));
		}
		if(drp != nil)
			rrfreelist(drp);
		procsetname("%s", procname);
		free(procname);
		return rp;
	}

	/*
	 *  try the name directly
	 */
	rp = dnresolve1(name, class, type, req, depth, recurse);
	if(rp == nil && (dp = idnlookup(name, class, 0)) != nil) {
		/*
		 * try it as a canonical name if we weren't told
		 * that the name didn't exist
		 */
		if(type != Tptr && dp->respcode != Rname)
			for(loops = 0; rp == nil && loops < Maxretries; loops++){
				/* retry cname, then the actual type */
				rp = dnresolve1(name, class, Tcname, req,
					depth, recurse);
				if(rp == nil)
					break;

				/* rp->host == nil shouldn't happen, but does */
				if(rp->negative || rp->host == nil){
					rrfreelist(rp);
					rp = nil;
					break;
				}

				name = rp->host->name;
				if(cn)
					rrcat(cn, rp);
				else
					rrfreelist(rp);

				rp = dnresolve1(name, class, type, req,
					depth, recurse);
			}

		/* distinction between not found and not good */
		if(rp == nil && rcode != nil && dp->respcode != Rok)
			*rcode = dp->respcode;
	}
	procsetname("%s", procname);
	free(procname);
	return randomize(rp);
}

static void
initquery(Query *qp, DN *dp, int type, Request *req, int depth)
{
	assert(dp != nil);

	memset(qp, 0, sizeof *qp);
	qp->dp = dp;
	qp->type = type;
	qp->nsrp = nil;
	qp->depth = depth;
	qp->prev = req->aux;
	qp->req = req;
	req->aux = qp;
}

static void
exitquery(Query *qp)
{
	if(qp->req->aux == qp)
		qp->req->aux = qp->prev;
	memset(qp, 0, sizeof *qp);	/* prevent accidents */
}

/*
 * if the response to a query hasn't arrived within 100 ms.,
 * it's unlikely to arrive at all.  after 1 s., it's really unlikely.
 * queries for missing RRs are likely to produce time-outs rather than
 * negative responses, so cname and aaaa queries are likely to time out,
 * thus we don't wait very long for them.
 */
static void
notestats(long ms, int tmout, int type)
{
	if (tmout) {
		stats.tmout++;
		if (type == Taaaa)
			stats.tmoutv6++;
		else if (type == Tcname)
			stats.tmoutcname++;
	} else {
		long wait10ths = ms / 100;

		if (wait10ths <= 0)
			stats.under10ths[0]++;
		else if (wait10ths >= nelem(stats.under10ths))
			stats.under10ths[nelem(stats.under10ths) - 1]++;
		else
			stats.under10ths[wait10ths]++;
	}
}

static void
noteinmem(void)
{
	stats.answinmem++;
}

/* netquery with given name servers, free ns rrs when done */
static int
netqueryns(Query *qp, RR *nsrp)
{
	int rv;

	if(nsrp == nil)
		return Answnone;
	qp->nsrp = nsrp;
	rv = netquery(qp);
	qp->nsrp = nil;		/* prevent accidents */
	rrfreelist(nsrp);
	return rv;
}

static RR*
issuequery(Query *qp, char *name, int class, int recurse)
{
	char *cp;
	DN *nsdp;
	RR *rp, *nsrp, *dbnsrp;

	/*
	 *  if we're running as just a resolver, query our
	 *  designated name servers
	 */
	if(cfg.resolver){
		nsrp = randomize(getdnsservers(class));
		if(nsrp != nil){
			int dot = strncmp(nsrp->owner->name, "local#dot#server", 16) == 0;
			if(netqueryns(qp, nsrp) > Answnone)
				return rrlookup(qp->dp, qp->type, OKneg);
			else if(dot)
				return nil;	/* do not fall-back for DoT */
		}
	}

	/*
 	 *  walk up the domain name looking for
	 *  a name server for the domain.
	 */
	for(cp = name; cp; cp = walkup(cp)){
		/*
		 *  if this is a local (served by us) domain,
		 *  return answer
		 */
		dbnsrp = randomize(dblookup(cp, class, Tns, 0, 0));
		if(dbnsrp && dbnsrp->local){
			rp = dblookup(name, class, qp->type, 1, dbnsrp->ttl);
			rrfreelist(dbnsrp);
			return rp;
		}

		/*
		 *  if recursion isn't set, just accept local
		 *  entries
		 */
		if(recurse == Dontrecurse){
			if(dbnsrp)
				rrfreelist(dbnsrp);
			continue;
		}

		/* look for ns in cache */
		nsdp = idnlookup(cp, class, 0);
		nsrp = nil;
		if(nsdp)
			nsrp = randomize(rrlookup(nsdp, Tns, NOneg));

		/* if the entry timed out, ignore it */
		if(nsrp && !nsrp->db && (long)(nsrp->expire - now) <= 0)
			rrfreelistptr(&nsrp);

		if(nsrp){
			rrfreelistptr(&dbnsrp);

			/* hack: stop search if ns is authoritative in db */
			if(cfg.cachedb && nsrp->db && nsrp->auth)
				cp = "";

			/* query the name servers found in cache */
			if(netqueryns(qp, nsrp) > Answnone)
				return rrlookup(qp->dp, qp->type, OKneg);
		} else if(dbnsrp)
			/* try the name servers found in db */
			if(netqueryns(qp, dbnsrp) > Answnone)
				return rrlookup(qp->dp, qp->type, NOneg);
	}
	return nil;
}

static RR*
dnresolve1(char *name, int class, int type, Request *req, int depth, int recurse)
{
	Area *area;
	DN *dp;
	RR *rp;
	Query q;

	if(debug)
		dnslog("%d: dnresolve1 %s %d %d",
			req->id, name, type, class);

	/* only class Cin implemented so far */
	if(class != Cin)
		return nil;

	dp = idnlookup(name, class, 1);

	/*
	 *  Try the cache first
	 */
	rp = rrlookup(dp, type, OKneg);
	if(rp)
		if(rp->db){
			/* unauthoritative db entries are hints */
			if(rp->auth) {
				noteinmem();
				if(debug)
					dnslog("%d: dnresolve1 %s %d %d: auth rr in db",
						req->id, name, type, class);
				return rp;
			}
		} else
			/* cached entry must still be valid */
			if((long)(rp->expire - now) > 0)
				/* but Tall entries are special */
				if(type != Tall || rp->query == Tall) {
					noteinmem();
					if(debug)
						dnslog("%d: dnresolve1 %s %d %d: rr not in db",
							req->id, name, type, class);
					return rp;
				}
	rrfreelist(rp);
	rp = nil;		/* accident prevention */
	USED(rp);

	/*
	 * try the cache for a canonical name. if found punt
	 * since we'll find it during the canonical name search
	 * in dnresolve().
	 */
	if(type != Tcname){
		rp = rrlookup(dp, Tcname, NOneg);
		rrfreelist(rp);
		if(rp){
			if(debug)
				dnslog("%d: dnresolve1 %s %d %d: rr from rrlookup for non-cname",
					req->id, name, type, class);
			return nil;
		}
	}

	/*
	 * if the domain name is within an area of ours,
	 * we should have found its data in memory by now.
	 */
	area = inmyarea(dp->name);
	if (area || strncmp(dp->name, "local#", 6) == 0)
		return nil;

	initquery(&q, dp, type, req, depth);
	rp = issuequery(&q, name, class, recurse);
	exitquery(&q);

	if(rp){
		if(debug)
			dnslog("%d: dnresolve1 %s %d %d: rr from query",
				req->id, name, type, class);
		return rp;
	}

	/* settle for a non-authoritative answer */
	rp = rrlookup(dp, type, OKneg);
	if(rp){
		if(debug)
			dnslog("%d: dnresolve1 %s %d %d: rr from rrlookup",
				req->id, name, type, class);
		return rp;
	}

	/* noone answered.  try the database, we might have a chance. */
	rp = dblookup(name, class, type, 0, 0);
	if (rp) {
		if(debug)
			dnslog("%d: dnresolve1 %s %d %d: rr from dblookup",
				req->id, name, type, class);
	}else{
		if(debug)
			dnslog("%d: dnresolve1 %s %d %d: no rr from dblookup; crapped out",
				req->id, name, type, class);
	}
	return rp;
}

/*
 *  walk a domain name one element to the right.
 *  return a pointer to that element.
 *  in other words, return a pointer to the parent domain name.
 */
char*
walkup(char *name)
{
	char *cp;

	cp = strchr(name, '.');
	if(cp)
		return cp+1;
	else if(*name)
		return "";
	else
		return 0;
}

/*
 *  Get a udp port for sending requests and reading replies.
 *  Put the port into "headers" mode.
 */
int
udpport(char *mntpt)
{
	static char hmsg[] = "headers";
	static char imsg[] = "ignoreadvice";

	char adir[NETPATHLEN], buf[NETPATHLEN];
	int fd, ctl;

	/* get a udp port */
	snprint(buf, sizeof buf, "%s/udp!*!0", mntpt);
	ctl = announce(buf, adir);
	if(ctl < 0)
		return -1;

	/* turn on header style interface */
	if(write(ctl, hmsg, sizeof(hmsg)-1) < 0){
		warning("can't enable %s on %s: %r", hmsg, adir);
		close(ctl);
		return -1;
	}

	/* ignore ICMP advice */
	write(ctl, imsg, sizeof(imsg)-1);

	/* grab the data file */
	snprint(buf, sizeof buf, "%s/data", adir);
	fd = open(buf, ORDWR|OCEXEC);
	if(fd < 0)
		warning("can't open udp port %s: %r", buf);
	close(ctl);
	return fd;
}

static void
initdnsmsg(DNSmsg *mp, RR *rp, int flags, ushort reqno)
{
	memset(mp, 0, sizeof *mp);
	mp->flags = flags;
	mp->id = reqno;
	mp->qd = rp;
	if(rp != nil)
		mp->qdcount = 1;
}

RR*
getednsopt(DNSmsg *mp, int *rcode)
{
	RR *rp, *x;

	rp = rrremtype(&mp->ar, Topt);
	if(rp == nil)
		return nil;

	mp->arcount--;
	while((x = rp->next) != nil){
		rp->next = x->next;
		rrfree(x);
		mp->arcount--;
		*rcode = Rformat;
	}

	if(rp->eflags & Evers)
		*rcode = Rbadvers;

	if(rp->udpsize < 512)
		rp->udpsize = 512;

	return rp;
}

int
getercode(DNSmsg *mp)
{
	if(mp->edns == nil)
		return mp->flags & Rmask;
	return (mp->flags & 0xF) | (mp->edns->eflags & Ercode) >> 20;
}

void
setercode(DNSmsg *mp, int rcode)
{
	if(mp->edns){
		mp->edns->eflags = (mp->edns->eflags & ~Ercode) | ((rcode << 20) & Ercode);
		rcode &= 0xF;
	}
	mp->flags = (mp->flags & ~Rmask) | (rcode & Rmask);
}

RR*
mkednsopt(void)
{
	RR *rp;

	rp = rralloc(Topt);
	rp->owner = dnlookup("", Cin, 1);
	rp->eflags = 0;

	/*
	 * Advertise a safe UDP response size
	 * instead of Maxudp as that is just
	 * the worst case we can accept.
	 *
	 * 1232 = MTU(1280)-IPv6(40)-UDP(8).
	 */
	rp->udpsize = 1232;

	return rp;
}

/* generate a DNS UDP query packet, return size of request (without Udphdr) */
int
mkreq(DN *dp, int type, uchar *pkt, int flags, ushort id)
{
	Udphdr *uh = (Udphdr*)pkt;
	DNSmsg m;
	RR *rp;
	int len;

	/* stuff port number into output buffer */
	memset(uh, 0, Udphdrsize);
	uh->rport[1] = 53;

	/* make request and convert it to output format */
	rp = rralloc(type);
	rp->owner = dp;
	initdnsmsg(&m, rp, flags, id);
	m.edns = mkednsopt();
	len = convDNS2M(&m, &pkt[Udphdrsize], Maxudp);
	rrfreelist(m.edns);
	rrfreelist(rp);
	return len;
}

void
freeanswers(DNSmsg *mp)
{
	rrfreelistptr(&mp->qd);
	rrfreelistptr(&mp->an);
	rrfreelistptr(&mp->ns);
	rrfreelistptr(&mp->ar);
	mp->qdcount = mp->ancount = mp->nscount = mp->arcount = 0;
}

/* timed read of reply. sets srcip if UDP. */
static int
readnet(Query *qp, int medium, int fd, uchar pkt[Maxpkt], uvlong endms,
	uchar **replyp, uchar *srcip)
{
	int len;
	long ms;
	uvlong startms;
	uchar *reply;

	*replyp = nil;

	startms = nowms;
	ms = (long)(endms - startms);
	if (ms < 1)
		return -1;		/* taking too long */

	len = -1;			/* pessimism */
	reply = pkt;
	switch (medium) {
	case Udp:
		alarm(ms);
		len = read(fd, pkt, Udphdrsize + Maxudp);
		alarm(0);
		if(len < 0)
			break;
		if(len <= Udphdrsize){
			len = -1;
			break;
		}
		ipmove(srcip, pkt);
		len   -= Udphdrsize;
		reply += Udphdrsize;
		break;
	case Tcp:
		alarm(ms);
		len = readn(fd, pkt, 2);
		if(len < 0){
			alarm(0);
			break;
		}
		if(len != 2){
			alarm(0);
			dnslog("%d: readnet: short read of 2-byte tcp msg size from %I",
				qp->req->id, srcip);
			len = -1;
			break;
		}
		len = pkt[0]<<8 | pkt[1];
		if(len <= 0 || len > Maxtcp || readn(fd, pkt+2, len) != len){
			alarm(0);
			dnslog("%d: readnet: short read of tcp data from %I",
				qp->req->id, srcip);
			len = -1;
			break;
		}
		alarm(0);
		reply += 2;
		break;
	}

	/* file statistics */
	ms = (long)(timems() - startms);
	notestats(ms, len < 0, qp->type);

	*replyp = reply;
	return len;
}

/*
 *  read replies to a request and remember the rrs in the answer(s).
 *  ignore any of the wrong type for UDP.
 *  wait at most until endms.
 */
static int
readreply(Query *qp, int medium, int fd, uvlong endms,
	DNSmsg *mp, uchar *srcip)
{
	uchar pkt[Maxpkt];
	uchar *reply;
	char *err;
	int len;
	RR *rp;

	for(;;){
		len = readnet(qp, medium, fd, pkt, endms, &reply, srcip);
		if (len < 0)
			break;

		/* convert into internal format  */
		memset(mp, 0, sizeof *mp);
		err = convM2DNS(reply, len, mp, nil);
		if (mp->flags & Ftrunc) {
			free(err);
			return 1;	/* signal truncation */
		}
		if(err){
			dnslog("%d: readreply: input err, len %d: %s from %I",
				qp->req->id, len, err, srcip);
			free(err);
		} else {
			logreply(qp->req->id, "rcvd", srcip, mp);

			/* answering the right question? */
			if(mp->id != qp->id)
				dnslog("%d: id %d instead of %d from %I",
					qp->req->id, mp->id, qp->id, srcip);
			else if(mp->qd == 0)
				dnslog("%d: no question RR from %I", qp->req->id, srcip);
			else if(mp->qd->owner != qp->dp)
				dnslog("%d: owner %s instead of %s from %I", qp->req->id,
					mp->qd->owner->name, qp->dp->name, srcip);
			else if(mp->qd->type != qp->type)
				dnslog("%d: qp->type %d instead of %d from %I",
					qp->req->id, mp->qd->type, qp->type, srcip);
			else {
				/* remember what request this is in answer to */
				for(rp = mp->an; rp; rp = rp->next)
					rp->query = qp->type;
				return 0;
			}
		}
		freeanswers(mp);

		/* only single reply is expected from TCP */
		if(medium == Tcp)
			break;
	}
	memset(mp, 0, sizeof *mp);
	return -1;
}

/*
 *	return non-0 if first list includes second list
 */
static int
contains(RR *rp1, RR *rp2)
{
	RR *trp1, *trp2;

	for(trp2 = rp2; trp2; trp2 = trp2->next){
		for(trp1 = rp1; trp1; trp1 = trp1->next)
			if(trp1->type == trp2->type)
			if(trp1->host == trp2->host)
			if(trp1->owner == trp2->owner)
				break;
		if(trp1 == nil)
			return 0;
	}
	return 1;
}

/*
 *  return multicast version if any
 */
static int
ipisbm(uchar *ip)
{
	if(isv4(ip)){
		if (ip[IPv4off] >= 0xe0 && ip[IPv4off] < 0xf0 ||
		    ipcmp(ip, IPv4bcast) == 0)
			return 4;
	} else
		if(ip[0] == 0xff)
			return 6;
	return 0;
}

static int
queryloops(Query *qp, RR *rp)
{
	DN *ns = rp->host;

	/*
	 *  looking up a server under itself
	 */
	if(subsume(rp->owner->name, ns->name))
		return 1;

	/*
	 *  cycle on name servers refering
	 *  to each another.
	 */
	for(; qp; qp = qp->prev)
		if(qp->dp == ns)
			return 1;

	return 0;
}

/*
 *  Get next server type address(es) into dest[nd] and beyond
 */
static int
serveraddrs(Query *qp, Dest dest[Maxdest], int nd, int type)
{
	RR *rp, *arp, *trp;
	ulong mark;
	Dest *p;

	if(nd >= Maxdest)		/* dest array is full? */
		return Maxdest;

	/*
	 *  look for a server whose address we already know.
	 *  if we find one, mark it so we ignore this on
	 *  subsequent passes.
	 */
	mark = 1UL<<type;
	arp = nil;
	for(rp = qp->nsrp; rp; rp = rp->next){
		if(rp->marker & mark)
			continue;
		arp = rrlookup(rp->host, type, NOneg);
		if(arp){
			rp->marker |= mark;
			break;
		}
		arp = dblookup(rp->host->name, Cin, type, 0, 0);
		if(arp){
			rp->marker |= mark;
			break;
		}
	}

	/*
	 *  if the cache and database lookup didn't find any new
	 *  server addresses, try resolving one via the network.
	 *  Mark any we try to resolve so we don't try a second time.
	 */
	if(arp == nil){
		for(rp = qp->nsrp; rp; rp = rp->next)
			if((rp->marker & mark) == 0)
			if(queryloops(qp, rp))
				/*
				 * give up as we should have got the address
				 * by higher up nameserver when recursing
				 * down, or will be queried when recursing up.
				 */
				return nd;

		for(rp = qp->nsrp; rp; rp = rp->next){
			if(rp->marker & mark)
				continue;
			rp->marker |= mark;
			if(strncmp(rp->owner->name, "local#", 6) == 0)
				continue;
			arp = dnresolve(rp->host->name, Cin, type, qp->req, 0,
				qp->depth+1, Recurse, 1, 0);
			rrfreelist(rrremneg(&arp));
			if(arp)
				break;
		}
	}

	/* use any addresses that we found */
	for(trp = arp; trp && nd < Maxdest; trp = trp->next){
		p = &dest[nd];
		memset(p, 0, sizeof *p);
		if(parseip(p->a, trp->ip->name) == -1)
			continue;
		if(ipcmp(p->a, IPnoaddr) == 0)
			continue;
		if(ipisbm(p->a))
			continue;
		if(cfg.serve && myip(p->a))
			continue;

		p->nx = 0;
		p->n = nil;
		p->s = trp->owner;
		for(rp = qp->nsrp; rp; rp = rp->next){
			if(rp->host == p->s){
				p->n = rp;
				break;
			}
		}
		p->code = Rtimeout;
		nd++;
	}
	rrfreelist(arp);
	return nd;
}

/*
 *  cache negative responses
 */
static void
cacheneg(DN *dp, int type, int rcode, RR *soarr)
{
	RR *rp;
	DN *soaowner;
	ulong ttl;

	stats.negcached++;

	/* no cache time specified, don't make anything up */
	if(soarr != nil){
		if(soarr->next != nil)
			rrfreelistptr(&soarr->next);
		soaowner = soarr->owner;
	} else
		soaowner = nil;

	/* the attach can cause soarr to be freed so mine it now */
	if(soarr != nil && soarr->soa != nil)
		ttl = soarr->soa->minttl;
	else
		ttl = 5*Min;

	/* add soa and negative RR to the database */
	rrattach(soarr, Authoritative);

	rp = rralloc(type);
	rp->owner = dp;
	rp->negative = 1;
	rp->negsoaowner = soaowner;
	rp->negrcode = rcode;
	rp->ttl = ttl;
	rrattach(rp, Authoritative);
}

static int
filterhints(RR *rp, void *arg)
{
	RR *nsrp;

	if(rp->type != Ta && rp->type != Taaaa)
		return 0;

	for(nsrp = arg; nsrp; nsrp = nsrp->next)
		if(nsrp->type == Tns && rp->owner == nsrp->host)
			return 1;

	return 0;
}

static int
filterauth(RR *rp, void *arg)
{
	Dest *dest;
	RR *nsrp;

	dest = arg;
	nsrp = dest->n;
	if(nsrp == nil)
		return 0;

	if(rp->type == Tsoa && rp->owner != nsrp->owner
	&& !subsume(nsrp->owner->name, rp->owner->name)
	&& strncmp(nsrp->owner->name, "local#", 6) != 0)
		return 1;

	if(rp->type != Tns)
		return 0;

	if(rp->owner != nsrp->owner
	&& !subsume(nsrp->owner->name, rp->owner->name)
	&& strncmp(nsrp->owner->name, "local#", 6) != 0)
		return 1;

	return baddelegation(rp, nsrp, dest->a);
}

static void
reportandfree(RR *l, char *note, Dest *p)
{
	RR *rp;

	while(rp = l){
		l = l->next;
		rp->next = nil;
		if(debug)
			dnslog("ignoring %s from %I/%s: %R",
				note, p->a, p->s->name, rp);
		rrfree(rp);
	}
}

/* returns Answerr (-1) on errors, else number of answers, which can be zero. */
static int
procansw(Query *qp, Dest *p, DNSmsg *mp)
{
	Query nq;
	DN *ndp;
	RR *tp, *soarr;
	int rv, rcode;

	if(mp->an == nil)
		stats.negans++;

	/* get extended rcode */
	rcode = Rok;
	mp->edns = getednsopt(mp, &rcode);
	if(rcode == Rok)
		rcode = getercode(mp);
	rrfreelistptr(&mp->edns);

	/* ignore any error replies */
	switch(rcode){
	case Rformat:
	case Rrefused:
	case Rserver:
	case Rbadvers:
		stats.negserver++;
		freeanswers(mp);
		p->code = Rserver;
		return Answerr;
	}

	/* ignore any bad delegations */
	if((tp = rrremfilter(&mp->ns, filterauth, p)) != 0)
		reportandfree(tp, "bad delegation", p);

	/* remove any soa's from the authority section */
	soarr = rrremtype(&mp->ns, Tsoa);

	/* only nameservers remaining */
	if((tp = rrremtype(&mp->ns, Tns)) != 0){
		reportandfree(mp->ns, "non-nameserver", p);
		mp->ns = tp;
	}

	/* remove answers not related to the question. */
	if((tp = rrremowner(&mp->an, qp->dp)) != 0){
		reportandfree(mp->an, "wrong subject answer", p);
		mp->an = tp;
	}
	if(qp->type != Tall){
		if((tp = rrremtype(&mp->an, qp->type)) != 0){
			reportandfree(mp->an, "wrong type answer", p);
			mp->an = tp;
		}
	}

	/* incorporate answers */
	unique(mp->an);
	unique(mp->ns);
	unique(mp->ar);

	if(mp->an){
		/*
		 * only use cname answer when returned. some dns servers
		 * attach (potential) spam hint address records which poisons
		 * the cache.
		 */
		if((tp = rrremtype(&mp->an, Tcname)) != 0){
			reportandfree(mp->an, "ip in cname answer", p);
			mp->an = tp;
		}
		rrattach(mp->an, (mp->flags & Fauth) != 0);
	}
	if(mp->ar){
		/* restrict hints to address rr's for nameservers only */
		if((tp = rrremfilter(&mp->ar, filterhints, mp->ns)) != 0){
			reportandfree(mp->ar, "hint", p);
			mp->ar = tp;
		}
		rrattach(mp->ar, Notauthoritative);
	}
	if(mp->ns && !cfg.justforw){
		ndp = mp->ns->owner;
		rrattach(mp->ns, Notauthoritative);
	} else {
		ndp = nil;
		rrfreelistptr(&mp->ns);
		mp->nscount = 0;
	}

	/* free the question */
	if(mp->qd) {
		rrfreelistptr(&mp->qd);
		mp->qdcount = 0;
	}

	/*
	 *  Any reply from an authoritative server
	 *  that does not provide more nameservers,
	 *  or a positive reply terminates the search.
	 *  A negative response now also terminates the search.
	 */
	if(mp->an || (mp->flags & Fauth) && mp->ns == nil){
		if(mp->an == nil && rcode == Rname)
			qp->dp->respcode = Rname;
		else
			qp->dp->respcode = Rok;

		/*
		 *  cache any negative responses, free soarr.
		 *  negative responses need not be authoritative:
		 *  they can legitimately come from a cache.
		 */
		if( /* (mp->flags & Fauth) && */ mp->an == nil)
			cacheneg(qp->dp, qp->type, rcode, soarr);
		else
			rrfreelist(soarr);
		return 1;
	} else if (mp->an == nil && rcode == Rname) {
		qp->dp->respcode = Rname;
		/*
		 *  cache negative response.
		 *  negative responses need not be authoritative:
		 *  they can legitimately come from a cache.
		 */
		cacheneg(qp->dp, qp->type, rcode, soarr);
		return 1;
	}
	stats.negnorname++;
	rrfreelist(soarr);

	/*
	 *  if we've been given better name servers, recurse.
	 *  if we're a pure resolver, don't recurse, we have
	 *  to forward to a fixed set of named servers.
	 */
	if(mp->ns == nil || cfg.resolver && cfg.justforw)
		return Answnone;
	tp = rrlookup(ndp, Tns, NOneg);
	if(contains(qp->nsrp, tp)){
		rrfreelist(tp);
		return Answnone;
	}

	initquery(&nq, qp->dp, qp->type, qp->req, qp->depth+1);
	rv = netqueryns(&nq, tp);
	exitquery(&nq);

	return rv;
}

static int
writenet(Query *qp, int medium, int fd, uchar *pkt, int len, Dest *p)
{
	uchar tmp[2];
	int rv;

	logrequest(qp->req->id, qp->depth, "send", p->a, p->s->name,
		qp->dp->name, qp->type);

	rv = -1;
	switch (medium) {
	case Udp:
		/* fill in UDP destination addr & send it */
		ipmove(pkt, p->a);
		if (write(fd, pkt, len+Udphdrsize) != len+Udphdrsize)
			warning("sending udp msg to %I/%s: %r", p->a, p->s->name);
		else {
			stats.qsentudp++;
			rv = 0;
		}
		break;
	case Tcp:
		tmp[0] = pkt[Udphdrsize-2], pkt[Udphdrsize-2] = len >> 8;
		tmp[1] = pkt[Udphdrsize-1], pkt[Udphdrsize-1] = len;
		len += 2;
		if (write(fd, pkt + Udphdrsize-2, len) != len)
			warning("sending tcp msg to %I/%s: %r", p->a, p->s->name);
		else {
			stats.qsenttcp++;
			rv = 0;
		}
		pkt[Udphdrsize-2] = tmp[0];
		pkt[Udphdrsize-1] = tmp[1];
		break;
	}
	return rv;
}

/*
 * send a query via tcp or tls to a single address
 * and read the answer(s) into mp->an.
 */
static int
tcpquery(Query *qp, uchar *pkt, int len, Dest *p, uvlong endms, DNSmsg *mp, int tls)
{
	static struct {
		QLock lk;
		struct {
			uvlong when;
			char *dest;
			int fd;
		} l[Maxtcpfree];
	} tcpfree;

	char buf[NETPATHLEN];
	int fd, rv, i, retry;
	long ms;

	memset(mp, 0, sizeof *mp);

	procsetname("%s query to %I/%s for %s %s", tls ? "tls" : "tcp", p->a, p->s->name,
		qp->dp->name, rrname(qp->type, buf, sizeof buf));

	snprint(buf, sizeof buf, "%s/tcp!%I!%s", mntpt, p->a, tls ? "853" : "53");

	fd = -1;
	qlock(&tcpfree.lk);
	for(i = 0; i < nelem(tcpfree.l); i++){
		if(tcpfree.l[i].dest == nil || tcpfree.l[i].fd < 0)
			continue;
		if(strcmp(tcpfree.l[i].dest, buf) != 0)
			continue;
		if(nowms - tcpfree.l[i].when < Maxtcpresuetm){
			fd = tcpfree.l[i].fd;
			tcpfree.l[i].fd = -1;
			break;
		}
	}
	qunlock(&tcpfree.lk);
	if(fd >= 0){
		retry = 1;
		goto Connected;
	}
Retry:
	retry = 0;
	ms = (long)(endms - nowms);
	if(ms < Minreqtm)
		return -1;	/* takes too long */
	if(ms > Maxtcpdialtm)
		ms = Maxtcpdialtm;

	alarm(ms);
	fd = dial(buf, nil, nil, nil);
	if(fd < 0){
		alarm(0);
		dnslog("%d: can't dial %s for %I/%s: %r",
			qp->req->id, buf, p->a, p->s->name);
		return -1;
	}
	if(tls){
		static char thumbfile[] = "/sys/lib/tls/dns";
		Thumbprint *thumb;
		TLSconn conn;

		memset(&conn, 0, sizeof conn);
		rv = tlsClient(fd, &conn);
		alarm(0);
		if(rv >= 0){
			fd = rv;

			thumb = initThumbprints(thumbfile, nil, "x509");
			if(thumb == nil){
				dnslog("%d: can't load thumb file %s: %r",
					qp->req->id, thumbfile);
				rv = -1;
			} else {
				if(!okCertificate(conn.cert, conn.certlen, thumb)){
					dnslog("%d: invalid fingerprint for %s; echo 'x509 %r' >>%s",
						qp->req->id, buf, thumbfile);
					rv = -1;
				}
				freeThumbprints(thumb);
			}
		}
		free(conn.cert);
		free(conn.sessionID);
		if(rv < 0){
			close(fd);
			return -1;
		}
	} else {
		alarm(0);
	}

Connected:
	rv = writenet(qp, Tcp, fd, pkt, len, p);
	if(rv == 0){
		timems();	/* account for time dialing and sending */
		rv = readreply(qp, Tcp, fd, endms, mp, pkt);
	}

	if(rv < 0){
		close(fd);
		if(retry)
			goto Retry;
		return rv;
	}

	qlock(&tcpfree.lk);
	if(tcpfree.l[nelem(tcpfree.l)-1].dest != nil){
		close(tcpfree.l[nelem(tcpfree.l)-1].fd);
		free(tcpfree.l[nelem(tcpfree.l)-1].dest);
	}
	memmove(tcpfree.l + 1, tcpfree.l, sizeof(tcpfree.l[0])*(nelem(tcpfree.l)-1));
	tcpfree.l[0].when = nowms;
	tcpfree.l[0].fd = fd;
	tcpfree.l[0].dest = estrdup(buf);
	qunlock(&tcpfree.lk);

	return rv;
}

static int
tlsqueryns(Query *qp, uchar *pkt, int len)
{
	Dest dest[Maxdest], *p;
	int rv, n;
	uvlong endms;
	DNSmsg m;

	/* populates dest with v4 and v6 addresses. */
	n = 0;
	n = serveraddrs(qp, dest, n, Ta);
	n = serveraddrs(qp, dest, n, Taaaa);
	for(p = dest; p < dest+n; p++){
		endms = timems() + Maxtcpdialtm + 500;
		if(endms > qp->req->aborttime)
			endms = qp->req->aborttime;
		if(tcpquery(qp, pkt, len, p, endms, &m, 1) == 0){
			/* free or incorporate RRs in m */
			rv = procansw(qp, p, &m);
			if(rv > Answnone)
				return rv;
		}
	}

	/* if all servers returned failure, propagate it */
	qp->dp->respcode = Rserver;
	for(p = dest; p < dest+n; p++)
		if(p->code != Rserver)
			qp->dp->respcode = Rok;

	return Answnone;
}

/*
 *  query name servers.  fill in pkt with on-the-wire representation of a
 *  DNSmsg derived from qp. if the name server returns a pointer to another
 *  name server, recurse.
 */
static int
udpqueryns(Query *qp, int fd, uchar *pkt, int len)
{
	Dest dest[Maxdest], *edest, *p, *np;
	int ndest, replywaits, rv, n;
	uchar srcip[IPaddrlen];
	char buf[32];
	uvlong endms;
	DNSmsg m;

	/* no destination yet */
	edest = dest;

	/*
	 *  transmit udp requests and wait for answers.
	 *  at most Maxtrans attempts to each address.
	 *  each cycle send one more message than the previous.
	 *  retry a query via tcp if its response is truncated.
	 */
	for(ndest = 2; ndest < Maxdest; ndest += 2){
		endms = nowms;
		if((long)(qp->req->aborttime - nowms) < Minreqtm)
			break;

		/*
		 * get a nameserver address if we need one.
		 * we're to transmit to more destinations than we currently have,
		 * so get another.
		 */
		n = edest - dest;
		if (n < ndest) {
			/* populates dest with v4 and v6 addresses. */
			n = serveraddrs(qp, dest, n, Ta);
			n = serveraddrs(qp, dest, n, Taaaa);
			edest = dest + n;
		}

		n = 0;
		for(p = dest; p < edest && p < &dest[ndest]; p++){
			/* skip destinations we've finished with */
			if(p->nx >= Maxtrans)
				continue;
			/* exponential backoff of requests */
			if((1UL<<p->nx) > ndest)
				continue;
			p->nx++;
			if(writenet(qp, Udp, fd, pkt, len, p) < 0)
				continue;
			n++;
		}
		/* nothing left to send to */
		if (n == 0)
			break;

		/* set the timeout for replies */
		endms += 500;
		if(endms > qp->req->aborttime)
			endms = qp->req->aborttime;

		procsetname("reading replies from %I...: %s %s from %s",
			pkt, qp->dp->name,
			rrname(qp->type, buf, sizeof buf), qp->req->from);

		for(replywaits = 0; replywaits < ndest; replywaits++){
			/* read udp answer into m, fill srcip */
			if(readreply(qp, Udp, fd, endms, &m, srcip) < 0)
				break;

			/* find responder */
			for(p = dest; p < edest; p++)
				if(ipcmp(p->a, srcip) == 0)
					break;
			if(p >= edest){
				freeanswers(&m);
				continue;
			}

			/* if response was truncated, try tcp */
			if(m.flags & Ftrunc){
				freeanswers(&m);
				if(tcpquery(qp, pkt, len, p, endms, &m, 0) < 0)
					break;	/* failed via tcp too */
				if(m.flags & Ftrunc){
					freeanswers(&m);
					break;
				}
			}

			/* remove all addrs of responding server from list */
			for(np = dest; np < edest; np++)
				if(np->s == p->s)
					np->nx = Maxtrans;

			/* free or incorporate RRs in m */
			rv = procansw(qp, p, &m);
			if(rv > Answnone)
				return rv;
		}
	}

	/* if all servers returned failure, propagate it */
	qp->dp->respcode = Rserver;
	for(p = dest; p < edest; p++)
		if(p->code != Rserver)
			qp->dp->respcode = Rok;

	return Answnone;
}

static int
doquery(Query *qp)
{
	int fd, rv, len, flag;
	uchar *pkt;
	RR *rp;

	pkt = emalloc(Maxudp+Udphdrsize);
	/* prepare server RR's for incremental lookup */
	for(rp = qp->nsrp; rp; rp = rp->next)
		rp->marker = 0;
	/* request recursion only for local/override dns servers */
	flag = Oquery;
	if(strncmp(qp->nsrp->owner->name, "local#", 6) == 0
	|| strncmp(qp->nsrp->owner->name, "override#", 9) == 0)
		flag |= Frecurse;
	/* pack request into a udp message */
	qp->id = rand();
	len = mkreq(qp->dp, qp->type, pkt, flag, qp->id);
	if(strncmp(qp->nsrp->owner->name, "local#dot#server", 16) == 0
	|| strncmp(qp->nsrp->owner->name, "override#dot#server", 16) == 0){
		rv = tlsqueryns(qp, pkt, len);
	} else {
		/*
		 * in principle we could use a single descriptor for a udp port
		 * to send all queries and receive all the answers to them,
		 * but we'd have to sort out the answers by dns-query id.
		 */
		fd = udpport(mntpt);
		if (fd < 0) {
			dnslog("%d: can't get udpport for %s query of name %s: %r",
				qp->req->id, mntpt, qp->dp->name);
			rv = -1;
			goto Out;
		}
		rv = udpqueryns(qp, fd, pkt, len);
		close(fd);
	}
Out:
	free(pkt);
	return rv;
}

/*
 * look up (qp->dp->name, qp->type) rr in dns,
 * using nameservers in qp->nsrp.
 */
static int
netquery(Query *qp)
{
	if(qp->depth > 12)			/* in a recursive loop? */
		return Answnone;

	slave(qp->req);

	/*
	 * slave might have forked.  if so, the parent process longjmped to
	 * req->mret; we're usually the child slave, but if there are too
	 * many children already, we're still the same process. under no
	 * circumstances block the 9p loop.
	 */
	if(!qp->req->isslave && strcmp(qp->req->from, "9p") == 0)
		return Answnone;

	return doquery(qp);
}