shithub: femtolisp

ref: 6c42012d3d227941cfec46b77768b2d3e5891dea
dir: /bitvector-ops.c/

View raw version
#include "llt.h"

#define ONES32 ((uint32_t)0xffffffffUL)

// greater than this # of words we use malloc instead of alloca
#define MALLOC_CUTOFF 2000

static inline
uint32_t count_bits(uint32_t b)
{
	b = b - ((b>>1)&0x55555555);
	b = ((b>>2)&0x33333333) + (b&0x33333333);
	b = ((b>>4)+b)&0x0f0f0f0f;
	b += (b>>8);
	b += (b>>16);
	return b & 0x3f;
}

uint32_t
bitreverse(uint32_t x)
{
	uint32_t m;

#ifdef __INTEL_COMPILER
	x = _bswap(x);
#else
	x = (x >> 16)      | (x << 16);       m = 0xff00ff00;
	x = ((x & m) >> 8) | ((x & ~m) << 8);
#endif
	m = 0xf0f0f0f0;
	x = ((x & m) >> 4) | ((x & ~m) << 4); m = 0xcccccccc;
	x = ((x & m) >> 2) | ((x & ~m) << 2); m = 0xaaaaaaaa;
	x = ((x & m) >> 1) | ((x & ~m) << 1);

	return x;
}

// shift all bits in a long bit vector
// n is # of int32s to consider, s is shift distance
// lowest bit-index is bit 0 of word 0
// TODO: handle boundary case of shift distance >= data size?
void
bitvector_shr(uint32_t *b, size_t n, uint32_t s)
{
	uint32_t i;
	if(s == 0 || n == 0)
		return;
	i = s >> 5;
	if(i){
		n -= i;
		memmove(b, &b[i], n*4);
		memset(&b[n], 0, i*4);
		s &= 31;
	}
	for(i = 0; i < n-1; i++)
		b[i] = (b[i] >> s) | (b[i+1] << (32-s));
	b[i] >>= s;
}

// out-of-place version, good for re-aligning a strided submatrix to
// linear representation when a copy is needed
// assumes that dest has the same amount of space as source, even if it
// wouldn't have been necessary to hold the shifted bits
void
bitvector_shr_to(uint32_t *dest, uint32_t *b, size_t n, uint32_t s)
{
	uint32_t i, j;
	if(n == 0)
		return;
	if(s == 0){
		memmove(dest, b, n*4);
		return;
	}
	j = s >> 5;
	if(j){
		n -= j;
		memset(&dest[n], 0, j*4);
		s &= 31;
		b = &b[j];
	}
	for(i = 0; i < n-1; i++)
		dest[i] = (b[i] >> s) | (b[i+1] << (32-s));
	dest[i] = b[i]>>s;
}

void
bitvector_shl(uint32_t *b, size_t n, uint32_t s)
{
	uint32_t i, scrap = 0, temp;
	if(s == 0 || n == 0)
		return;
	i = s >> 5;
	if(i){
		n -= i;
		memmove(&b[i], b, n*4);
		memset(b, 0, i*4);
		s &= 31;
		b = &b[i];
	}
	for(i = 0; i < n; i++){
		temp = (b[i] << s) | scrap;
		scrap = b[i] >> (32-s);
		b[i] = temp;
	}
}

// if dest has more space than source, set scrap to true to keep the
// top bits that would otherwise be shifted out
void
bitvector_shl_to(uint32_t *dest, uint32_t *b, size_t n, uint32_t s, int scrap)
{
	uint32_t i, j, sc = 0;
	if(n == 0)
		return;
	if(s == 0){
		memmove(dest, b, n*4);
		return;
	}
	j = s >> 5;
	if(j){
		n -= j;
		memset(dest, 0, j*4);
		s &= 31;
		dest = &dest[j];
	}
	for(i = 0; i < n; i++){
		dest[i] = (b[i] << s) | sc;
		sc = b[i] >> (32-s);
	}
	if(scrap)
		dest[i] = sc;
}

// set nbits to c, starting at given bit offset
// assumes offs < 32
void
bitvector_fill(uint32_t *b, uint32_t offs, uint32_t c, uint32_t nbits)
{
	uint32_t i, nw, tail, mask;

	if(nbits == 0)
		return;
	nw = (offs+nbits+31)>>5;

	if(nw == 1){
		mask = (lomask(nbits)<<offs);
		if(c)
			b[0] |= mask;
		else
			b[0] &= ~mask;
		return;
	}

	mask = lomask(offs);
	if(c)
		b[0] |= ~mask;
	else
		b[0] &= mask;

	mask = c ? ONES32 : 0;

	for(i = 1; i < nw-1; i++)
		b[i] = mask;

	tail = (offs+nbits) & 31;
	if(tail == 0)
		b[i] = mask;
	else{
		mask = lomask(tail);
		if(c)
			b[i] |= mask;
		else
			b[i] &= ~mask;
	}
}

void
bitvector_not(uint32_t *b, uint32_t offs, uint32_t nbits)
{
	uint32_t i, nw, tail, mask;

	if(nbits == 0)
		return;
	nw = (offs+nbits+31)>>5;

	if(nw == 1){
		mask = lomask(nbits)<<offs;
		b[0] ^= mask;
		return;
	}

	mask = ~lomask(offs);
	b[0] ^= mask;

	for(i = 1; i < nw-1; i++)
		b[i] = ~b[i];

	tail = (offs+nbits)&31;
	if(tail == 0)
		b[i] = ~b[i];
	else{
		mask = lomask(tail);
		b[i] ^= mask;
	}
}

// constant-space bit vector copy in a single pass, with arbitrary
// offsets and lengths. to get this right, there are 16 cases to handle!
#define BITVECTOR_COPY_OP(name, OP) \
void \
bitvector_##name(uint32_t *dest, uint32_t doffs, uint32_t *src, uint32_t soffs, uint32_t nbits) \
{ \
	uint32_t i, s, nw, tail, snw, mask, scrap; \
	if(nbits == 0) \
		return; \
	nw = (doffs+nbits+31)>>5; \
	if(soffs == doffs){ \
		if(nw == 1){ \
			mask = (lomask(nbits)<<doffs); \
			dest[0] = (dest[0] & ~mask) | (OP(src[0]) & mask); \
			return; \
		} \
		mask = ~lomask(doffs); \
		dest[0] = (dest[0] & ~mask) | (OP(src[0]) & mask); \
		for(i = 1; i < nw-1; i++) \
			dest[i] = OP(src[i]); \
		tail = (doffs+nbits)&31; \
		if(tail == 0) \
			dest[i] = src[i]; \
		else { \
			mask = lomask(tail); \
			dest[i] = (dest[i] & ~mask) | (OP(src[i]) & mask); \
		} \
		return; \
	} \
	snw = (soffs+nbits+31)>>5; \
	if(soffs < doffs){ \
		s = doffs-soffs; \
		if(nw == 1){ \
			mask = lomask(nbits) << doffs; \
			dest[0] = (dest[0] & ~mask) | ((OP(src[0])<<s) & mask); \
			return; \
		} \
		mask = ~lomask(doffs); \
		dest[0] = (dest[0] & ~mask) | ((OP(src[0])<<s) & mask); \
		scrap = OP(src[0])>>(32-s); \
		for(i = 1; i < snw-1; i++){ \
			dest[i] = (OP(src[i])<<s) | scrap; \
			scrap = OP(src[i])>>(32-s); \
		} \
		tail = (doffs+nbits)&31; \
		mask = tail ? lomask(tail) : ONES32; \
		if(snw == nw) \
			dest[i] = (dest[i] & ~mask) | (((OP(src[i])<<s)|scrap) & mask); \
		else{ /* snw < nw */ \
			if(snw == 1) \
				dest[i] = (dest[i] & ~mask) | (((OP(src[i])<<s) | scrap) & mask); \
			else{ \
				dest[i] = (OP(src[i])<<s) | scrap; \
				scrap = OP(src[i])>>(32-s); \
				i++; \
				dest[i] = (dest[i] & ~mask) | (scrap & mask); \
			} \
		} \
	}else{ \
		s = soffs-doffs; \
		if(snw == 1){ \
			mask = (lomask(nbits)<<doffs); \
			dest[0] = (dest[0] & ~mask) | ((OP(src[0])>>s) & mask); \
			return; \
		} \
		if(nw == 1){ \
			mask = (lomask(nbits)<<doffs); \
			dest[0] = (dest[0] & ~mask) | \
				(((OP(src[0])>>s)|(OP(src[1])<<(32-s))) & mask); \
			return; \
		} \
		mask = ~lomask(doffs); \
		dest[0] = (dest[0] & ~mask) | (((OP(src[0])>>s)|(OP(src[1])<<(32-s))) & mask); \
		for(i = 1; i < nw-1; i++) \
			dest[i] = (OP(src[i])>>s) | (OP(src[i+1])<<(32-s)); \
		tail = (doffs+nbits)&31; \
		mask = tail ? lomask(tail) : ONES32; \
		if(snw == nw){ \
			dest[i] = (dest[i] & ~mask) | ((OP(src[i])>>s) & mask); \
		} \
		else /* snw > nw */ { \
			dest[i] = (dest[i] & ~mask) | \
				(((OP(src[i])>>s)|(OP(src[i+1])<<(32-s))) & mask); \
		} \
	} \
}

#define BV_COPY(a) (a)
#define BV_NOT(a) (~(a))
BITVECTOR_COPY_OP(copy, BV_COPY)
BITVECTOR_COPY_OP(not_to, BV_NOT)

// copy from source to dest while reversing bit-order
// assumes dest offset == 0
// assumes source and dest don't overlap
// assumes offset < 32
void
bitvector_reverse_to(uint32_t *dest, uint32_t *src, uint32_t soffs, uint32_t nbits)
{
	uint32_t i, nw, tail;

	if(nbits == 0)
		return;

	nw = (soffs+nbits+31)>>5;
	// first, reverse the words while reversing bit order within each word
	for(i = 0; i < nw/2; i++){
		dest[i] = bitreverse(src[nw-i-1]);
		dest[nw-i-1] = bitreverse(src[i]);
	}
	if(nw&0x1)
		dest[i] = bitreverse(src[i]);

	tail = (soffs+nbits)&31;
	if(tail)
		bitvector_shr(dest, nw, 32-tail);
}

void
bitvector_reverse(uint32_t *b, uint32_t offs, uint32_t nbits)
{
	uint32_t i, nw, tail, *temp, a[MALLOC_CUTOFF];

	if(nbits == 0)
		return;

	nw = (offs+nbits+31)>>5;
	temp = (nw > MALLOC_CUTOFF) ? malloc(nw*4) : a;
	for(i = 0; i < nw/2; i++){
		temp[i]	= bitreverse(b[nw-i-1]);
		temp[nw-i-1] = bitreverse(b[i]);
	}
	if(nw & 1)
		temp[i] = bitreverse(b[i]);

	tail = (offs+nbits)&31;
	bitvector_copy(b, offs, temp, (32-tail)&31, nbits);
	if(nw > MALLOC_CUTOFF)
		free(temp);
}

uint64_t
bitvector_count(uint32_t *b, uint32_t offs, uint64_t nbits)
{
	size_t i, nw;
	uint32_t ntail;
	uint64_t ans;

	if(nbits == 0)
		return 0;
	nw = ((uint64_t)offs+nbits+31)>>5;

	if(nw == 1)
		return count_bits(b[0] & (lomask(nbits)<<offs));

	ans = count_bits(b[0]>>offs);  // first end cap

	for(i = 1; i < nw-1; i++)
		ans += count_bits(b[i]);

	ntail = (offs + (uint32_t)nbits) & 31;
	ans += count_bits(b[i] & (ntail > 0 ? lomask(ntail) : ONES32));  // last end cap

	return ans;
}

uint32_t
bitvector_any0(uint32_t *b, uint32_t offs, uint32_t nbits)
{
	uint32_t i, nw, tail, mask;

	if(nbits == 0)
		return 0;
	nw = (offs+nbits+31)>>5;

	if(nw == 1){
		mask = (lomask(nbits)<<offs);
		if((b[0] & mask) != mask)
			return 1;
		return 0;
	}

	mask = ~lomask(offs);
	if((b[0] & mask) != mask)
		return 1;

	for(i = 1; i < nw-1; i++)
		if(b[i] != ONES32)
			return 1;

	tail = (offs+nbits)&31;
	if(tail == 0)
		return b[i] != ONES32;
	mask = lomask(tail);
	return (b[i] & mask) != mask;
}

uint32_t
bitvector_any1(uint32_t *b, uint32_t offs, uint32_t nbits)
{
	uint32_t i, nw, tail, mask;

	if(nbits == 0)
		return 0;
	nw = (offs+nbits+31)>>5;

	if(nw == 1){
		mask = lomask(nbits)<<offs;
		return (b[0] & mask) != 0;
	}

	mask = ~lomask(offs);
	if((b[0] & mask) != 0)
		return 1;

	for(i = 1; i < nw-1; i++){
		if(b[i] != 0)
			return 1;
	}

	tail = (offs+nbits)&31;
	if(tail == 0)
		return b[i] != 0;
	return (b[i] & lomask(tail)) != 0;
}

static void
adjust_offset_to(uint32_t *dest, uint32_t *src, uint32_t nw, uint32_t soffs, uint32_t newoffs)
{
	if(newoffs > soffs)
		bitvector_shl_to(dest, src, nw, newoffs-soffs, 1);
	else
		bitvector_shr_to(dest, src, nw, soffs-newoffs);
}

#define BITVECTOR_BINARY_OP_TO(opname, OP) \
void \
bitvector_##opname##_to(uint32_t *dest, uint32_t doffs, uint32_t *a, uint32_t aoffs, uint32_t *b, uint32_t boffs, uint32_t nbits) \
{ \
	uint32_t nw = (doffs+nbits+31)>>5; \
	uint32_t atmp[MALLOC_CUTOFF+1]; \
	uint32_t *temp = nw>MALLOC_CUTOFF ? malloc((nw+1)*4) : atmp; \
	uint32_t i, anw, bnw; \
	if(aoffs == boffs){ \
		anw = (aoffs+nbits+31)>>5; \
	}else if(aoffs == doffs){ \
		bnw = (boffs+nbits+31)>>5; \
		adjust_offset_to(temp, b, bnw, boffs, aoffs); \
		b = temp; \
		anw = nw; \
	}else{ \
		anw = (aoffs+nbits+31)>>5; \
		bnw = (boffs+nbits+31)>>5; \
		adjust_offset_to(temp, a, anw, aoffs, boffs); \
		a = temp; \
		aoffs = boffs; \
		anw = bnw; \
	} \
	for(i = 0; i < anw; i++) \
		temp[i] = OP(a[i], b[i]); \
	bitvector_copy(dest, doffs, temp, aoffs, nbits); \
	if(nw>MALLOC_CUTOFF) \
		free(temp); \
}

#define BV_AND(a, b) ((a)&(b))
#define BV_OR(a, b) ((a)|(b))
#define BV_XOR(a, b) ((a)^(b))
BITVECTOR_BINARY_OP_TO(and, BV_AND)
BITVECTOR_BINARY_OP_TO(or, BV_OR)
BITVECTOR_BINARY_OP_TO(xor, BV_XOR)