shithub: libvpx

ref: fea3556e20f63ed1a03daa45804f724e70705aa0
parent: 2e0d55314c28c546d98595e4b3799ca451a5a777
author: Johann <johannkoenig@google.com>
date: Thu Feb 9 07:38:31 EST 2012

Fix variance overflow

In the variance calculations the differences are summed and the sum is later
squared. When the sum exceeds sqrt(2^31) the square overflows the signed
32-bit range, so the arithmetic right shift sign-extends it and the value is
treated as negative, which gives incorrect results.

To fix this we cast the result of the multiplication to unsigned.

An alternative fix would be to shift sum right by 4 before multiplying.
However, that would reduce precision.

For 16x16 blocks the maximum absolute sum is 16 * 16 * 255 = 65280, while
sqrt(2^31) is 46340 (and change).

PPC change is untested.

Change-Id: I1bad27ea0720067def6d71a6da5f789508cec265
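
For illustration, here is a minimal standalone C sketch (not part of the
patch) of the failure and of both candidate fixes, using the worst-case
16x16 numbers from the message:

    #include <stdio.h>

    int main(void)
    {
        int sum = 65280;              /* worst case for a 16x16 block: 16 * 16 * 255 */
        unsigned int sse = 16646400;  /* 16 * 16 * 255 * 255: every pixel differs by
                                         255, so the true variance is exactly 0 */

        /* Buggy path: 65280 * 65280 = 4261478400 wraps to -33488896 in a signed
           32-bit int (formally undefined behavior; shown for the two's-complement
           wraparound the message describes).  The arithmetic shift keeps the sign,
           so the subtracted term is -130816 and the result is 16777216, not 0. */
        unsigned int wrong = sse - ((sum * sum) >> 8);

        /* This patch: reinterpret the product as unsigned so the shift fills with
           zeros; 4261478400 >> 8 = 16646400 and the variance comes out as 0. */
        unsigned int fixed = sse - ((unsigned int)(sum * sum) >> 8);

        /* The rejected alternative: (sum >> 4) * (sum >> 4) cannot overflow
           (4080 * 4080 fits easily) but drops the low bits of sum; e.g. for
           sum = 255 it would give 15 * 15 = 225 where the exact term is 254. */
        unsigned int alt = sse - (unsigned int)((sum >> 4) * (sum >> 4));

        printf("buggy: %u  fixed: %u  alt: %u\n", wrong, fixed, alt);
        return 0;
    }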

--- a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
@@ -144,7 +144,7 @@
     ldr     r6, [sp, #40]       ; get address of sse
     mul     r0, r8, r8          ; sum * sum
     str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
 
     ldmfd   sp!, {r4-r12, pc}
 
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@@ -169,7 +169,7 @@
     ldr     r6, [sp, #40]       ; get address of sse
     mul     r0, r8, r8          ; sum * sum
     str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
 
     ldmfd   sp!, {r4-r12, pc}
 
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@@ -210,7 +210,7 @@
     ldr     r6, [sp, #40]       ; get address of sse
     mul     r0, r8, r8          ; sum * sum
     str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
 
     ldmfd   sp!, {r4-r12, pc}
 
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -171,7 +171,7 @@
     ldr     r6, [sp, #40]       ; get address of sse
     mul     r0, r8, r8          ; sum * sum
     str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
 
     ldmfd   sp!, {r4-r12, pc}
 
--- a/vp8/encoder/arm/neon/variance_neon.asm
+++ b/vp8/encoder/arm/neon/variance_neon.asm
@@ -77,14 +77,14 @@
     ;vmov.32        r1, d1[0]
     ;mul            r0, r0, r0
     ;str            r1, [r12]
-    ;sub            r0, r1, r0, asr #8
+    ;sub            r0, r1, r0, lsr #8
 
-    ;sum is in [-255x256, 255x256]. sumxsum is 32-bit. Shift to right should
-    ;have sign-bit exension, which is vshr.s. Have to use s32 to make it right.
+    ; while sum is signed, sum * sum is always positive and must be treated as
+    ; unsigned to avoid propagating the sign bit.
     vmull.s32       q5, d0, d0
     vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #8
+    vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
     bx              lr
@@ -145,8 +145,8 @@
 
     vmull.s32       q5, d0, d0
     vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.s32        d10, d10, #7
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #7
+    vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
     bx              lr
@@ -200,8 +200,8 @@
 
     vmull.s32       q5, d0, d0
     vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.s32        d10, d10, #7
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #7
+    vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
     bx              lr
@@ -265,8 +265,8 @@
 
     vmull.s32       q5, d0, d0
     vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.s32        d10, d10, #6
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #6
+    vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
     bx              lr
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
@@ -405,8 +405,8 @@
 
     vmull.s32       q5, d0, d0
     vst1.32         {d1[0]}, [r6]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #8
+    vsub.u32        d0, d1, d10
 
     add             sp, sp, #528
     vmov.32         r0, d0[0]                   ;return
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
@@ -112,8 +112,8 @@
 
     vmull.s32       q5, d0, d0
     vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #8
+    vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
     pop             {pc}
@@ -208,8 +208,8 @@
 
     vmull.s32       q5, d0, d0
     vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #8
+    vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
     pop             {pc}
@@ -327,8 +327,8 @@
 
     vmull.s32       q5, d0, d0
     vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #8
+    vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
     pop             {pc}
@@ -560,8 +560,8 @@
 
     vmull.s32       q5, d0, d0
     vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #8
+    vsub.u32        d0, d1, d10
 
     add             sp, sp, #256
     vmov.32         r0, d0[0]                   ;return
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
@@ -206,8 +206,8 @@
 
     vmull.s32       q5, d0, d0
     vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #6
-    vsub.s32        d0, d1, d10
+    vshr.u32        d10, d10, #6
+    vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
     pop             {r4-r5, pc}
--- a/vp8/encoder/ppc/variance_altivec.asm
+++ b/vp8/encoder/ppc/variance_altivec.asm
@@ -98,7 +98,7 @@
     stw     r4, 0(r7)           ;# sse
 
     mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
+    srlwi   r3, r3, \DS         ;# (sum*sum) >> DS
     subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
 .endm
 
@@ -142,7 +142,7 @@
     stw     r4, 0(r7)           ;# sse
 
     mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, \DS         ;# (sum*sum) >> 8
+    srlwi   r3, r3, \DS         ;# (sum*sum) >> 8
     subf    r3, r3, r4          ;# sse - ((sum*sum) >> 8)
 .endm
 
@@ -367,7 +367,7 @@
     stw     r4, 0(r7)           ;# sse
 
     mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, 4           ;# (sum*sum) >> 4
+    srlwi   r3, r3, 4           ;# (sum*sum) >> 4
     subf    r3, r3, r4          ;# sse - ((sum*sum) >> 4)
 
     epilogue
--- a/vp8/encoder/ppc/variance_subpixel_altivec.asm
+++ b/vp8/encoder/ppc/variance_subpixel_altivec.asm
@@ -157,7 +157,7 @@
     stw     r4, 0(r9)           ;# sse
 
     mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, \DS         ;# (sum*sum) >> 8
+    srlwi   r3, r3, \DS         ;# (sum*sum) >> 8
     subf    r3, r3, r4          ;# sse - ((sum*sum) >> 8)
 .endm
 
--- a/vp8/encoder/variance_c.c
+++ b/vp8/encoder/variance_c.c
@@ -75,7 +75,7 @@
 
     variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
     *sse = var;
-    return (var - ((avg * avg) >> 8));
+    return (var - ((unsigned int)(avg * avg) >> 8));
 }
 
 unsigned int vp8_variance8x16_c(
@@ -91,7 +91,7 @@
 
     variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
     *sse = var;
-    return (var - ((avg * avg) >> 7));
+    return (var - ((unsigned int)(avg * avg) >> 7));
 }
 
 unsigned int vp8_variance16x8_c(
@@ -107,7 +107,7 @@
 
     variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
     *sse = var;
-    return (var - ((avg * avg) >> 7));
+    return (var - ((unsigned int)(avg * avg) >> 7));
 }
 
 
@@ -124,7 +124,7 @@
 
     variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
     *sse = var;
-    return (var - ((avg * avg) >> 6));
+    return (var - ((unsigned int)(avg * avg) >> 6));
 }
 
 unsigned int vp8_variance4x4_c(
@@ -140,7 +140,7 @@
 
     variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
     *sse = var;
-    return (var - ((avg * avg) >> 4));
+    return (var - ((unsigned int)(avg * avg) >> 4));
 }
 
 
--- a/vp8/encoder/x86/variance_mmx.c
+++ b/vp8/encoder/x86/variance_mmx.c
@@ -91,7 +91,7 @@
 
     vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
     *sse = var;
-    return (var - ((avg * avg) >> 4));
+    return (var - ((unsigned int)(avg * avg) >> 4));
 
 }
 
@@ -108,7 +108,7 @@
     vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
     *sse = var;
 
-    return (var - ((avg * avg) >> 6));
+    return (var - ((unsigned int)(avg * avg) >> 6));
 
 }
 
@@ -153,7 +153,7 @@
     var = sse0 + sse1 + sse2 + sse3;
     avg = sum0 + sum1 + sum2 + sum3;
     *sse = var;
-    return (var - ((avg * avg) >> 8));
+    return (var - ((unsigned int)(avg * avg) >> 8));
 }
 
 unsigned int vp8_variance16x8_mmx(
@@ -172,7 +172,7 @@
     var = sse0 + sse1;
     avg = sum0 + sum1;
     *sse = var;
-    return (var - ((avg * avg) >> 7));
+    return (var - ((unsigned int)(avg * avg) >> 7));
 
 }
 
@@ -194,7 +194,7 @@
     avg = sum0 + sum1;
     *sse = var;
 
-    return (var - ((avg * avg) >> 7));
+    return (var - ((unsigned int)(avg * avg) >> 7));
 
 }
 
--- a/vp8/encoder/x86/variance_sse2.c
+++ b/vp8/encoder/x86/variance_sse2.c
@@ -148,7 +148,7 @@
 
     vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
     *sse = var;
-    return (var - ((avg * avg) >> 4));
+    return (var - ((unsigned int)(avg * avg) >> 4));
 
 }
 
@@ -165,7 +165,7 @@
 
     vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
     *sse = var;
-    return (var - ((avg * avg) >> 6));
+    return (var - ((unsigned int)(avg * avg) >> 6));
 
 }
 
@@ -220,7 +220,7 @@
     var = sse0 + sse1;
     avg = sum0 + sum1;
     *sse = var;
-    return (var - ((avg * avg) >> 7));
+    return (var - ((unsigned int)(avg * avg) >> 7));
 
 }
 
@@ -241,7 +241,7 @@
     var = sse0 + sse1;
     avg = sum0 + sum1;
     *sse = var;
-    return (var - ((avg * avg) >> 7));
+    return (var - ((unsigned int)(avg * avg) >> 7));
 
 }
 
--- a/vp8/encoder/x86/variance_ssse3.c
+++ b/vp8/encoder/x86/variance_ssse3.c
@@ -112,7 +112,7 @@
     }
 
     *sse = xxsum0;
-    return (xxsum0 - ((xsum0 * xsum0) >> 8));
+    return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
 }
 
 unsigned int vp8_sub_pixel_variance16x8_ssse3
@@ -161,5 +161,5 @@
     }
 
     *sse = xxsum0;
-    return (xxsum0 - ((xsum0 * xsum0) >> 7));
+    return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 7));
 }