shithub: dav1d

Download patch

ref: fa1b265142e1409a986f01bd7abe115b308c1028
parent: 44d0de41d478b6b41a1ebbf1de012caa8d75cca0
author: Henrik Gramner <gramner@twoorioles.com>
date: Thu Apr 11 19:20:18 EDT 2019

x86-64: Add msac_decode_symbol_adapt SSE2 asm

Also make various minor optimizations/style fixes to the MSAC C functions.

--- a/src/cdf.c
+++ b/src/cdf.c
@@ -813,7 +813,7 @@
     AOM_CDF4(4096, 11264, 19328)
 };
 
-static const uint16_t default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 1] = {
+static const uint16_t default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 1 + 2] = {
     {
         { AOM_CDF13(15588, 17027, 19338, 20218, 20682, 21110, 21825, 23244,
                     24189, 28165, 29093, 30466) },
--- a/src/cdf.h
+++ b/src/cdf.h
@@ -34,11 +34,13 @@
 #include "src/ref.h"
 #include "src/thread_data.h"
 
+/* Buffers padded to [8] or [16] for SIMD where needed. */
+
 typedef struct CdfModeContext {
-    uint16_t y_mode[4][N_INTRA_PRED_MODES + 1];
+    uint16_t y_mode[4][N_INTRA_PRED_MODES + 1 + 2];
     uint16_t use_filter_intra[N_BS_SIZES][2];
     uint16_t filter_intra[5 + 1];
-    uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 1];
+    uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 1 + 1];
     uint16_t angle_delta[8][8];
     uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1];
     uint16_t newmv_mode[6][2];
@@ -66,7 +68,7 @@
     uint16_t txtp_intra[3][N_TX_SIZES][N_INTRA_PRED_MODES][N_TX_TYPES + 1];
     uint16_t skip[3][2];
     uint16_t skip_mode[3][2];
-    uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 1];
+    uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 1 + 5];
     uint16_t seg_pred[3][2];
     uint16_t seg_id[3][DAV1D_MAX_SEGMENTS + 1];
     uint16_t cfl_sign[8 + 1];
@@ -88,12 +90,12 @@
 typedef struct CdfCoefContext {
     uint16_t skip[N_TX_SIZES][13][2];
     uint16_t eob_bin_16[2][2][6];
-    uint16_t eob_bin_32[2][2][7];
+    uint16_t eob_bin_32[2][2][7 + 1];
     uint16_t eob_bin_64[2][2][8];
     uint16_t eob_bin_128[2][2][9];
-    uint16_t eob_bin_256[2][2][10];
-    uint16_t eob_bin_512[2][2][11];
-    uint16_t eob_bin_1024[2][2][12];
+    uint16_t eob_bin_256[2][2][10 + 6];
+    uint16_t eob_bin_512[2][2][11 + 5];
+    uint16_t eob_bin_1024[2][2][12 + 4];
     uint16_t eob_hi_bit[N_TX_SIZES][2][11 /*22*/][2];
     uint16_t eob_base_tok[N_TX_SIZES][2][4][4];
     uint16_t base_tok[N_TX_SIZES][2][41][5];
@@ -102,7 +104,7 @@
 } CdfCoefContext;
 
 typedef struct CdfMvComponent {
-    uint16_t classes[11 + 1];
+    uint16_t classes[11 + 1 + 4];
     uint16_t class0[2];
     uint16_t classN[10][2];
     uint16_t class0_fp[2][4 + 1];
@@ -119,7 +121,7 @@
 
 typedef struct CdfContext {
     CdfModeContext m;
-    uint16_t kfym[5][5][N_INTRA_PRED_MODES + 1];
+    uint16_t kfym[5][5][N_INTRA_PRED_MODES + 1 + 2];
     CdfCoefContext coef;
     CdfMvContext mv, dmv;
 } CdfContext;
--- a/src/decode.c
+++ b/src/decode.c
@@ -80,15 +80,15 @@
     const Dav1dFrameContext *const f = t->f;
     const int have_hp = f->frame_hdr->hp;
     const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->sign);
-    const int cl = dav1d_msac_decode_symbol_adapt(&ts->msac,
-                                                  mv_comp->classes, 11);
+    const int cl = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+                                                    mv_comp->classes, 11);
     int up, fp, hp;
 
     if (!cl) {
         up = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0);
         if (have_fp) {
-            fp = dav1d_msac_decode_symbol_adapt(&ts->msac,
-                                                mv_comp->class0_fp[up], 4);
+            fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+                                                 mv_comp->class0_fp[up], 4);
             hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
                                                         mv_comp->class0_hp) : 1;
         } else {
@@ -101,8 +101,8 @@
             up |= dav1d_msac_decode_bool_adapt(&ts->msac,
                                                mv_comp->classN[n]) << n;
         if (have_fp) {
-            fp = dav1d_msac_decode_symbol_adapt(&ts->msac,
-                                                mv_comp->classN_fp, 4);
+            fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+                                                 mv_comp->classN_fp, 4);
             hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
                                                         mv_comp->classN_hp) : 1;
         } else {
@@ -119,8 +119,8 @@
 static void read_mv_residual(Dav1dTileContext *const t, mv *const ref_mv,
                              CdfMvContext *const mv_cdf, const int have_fp)
 {
-    switch (dav1d_msac_decode_symbol_adapt(&t->ts->msac, t->ts->cdf.mv.joint,
-                                           N_MV_JOINTS))
+    switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint,
+                                            N_MV_JOINTS))
     {
     case MV_JOINT_HV:
         ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp);
@@ -379,7 +379,7 @@
 {
     Dav1dTileState *const ts = t->ts;
     const Dav1dFrameContext *const f = t->f;
-    const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt(&ts->msac,
+    const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                                            ts->cdf.m.pal_sz[pl][sz_ctx], 7) + 2;
     uint16_t cache[16], used_cache[8];
     int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
@@ -595,7 +595,7 @@
         const int last = imax(0, i - h4 * 4 + 1);
         order_palette(pal_idx, stride, i, first, last, order, ctx);
         for (int j = first, m = 0; j >= last; j--, m++) {
-            const int color_idx = dav1d_msac_decode_symbol_adapt(&ts->msac,
+            const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                                       color_map_cdf[ctx[m]], b->pal_sz[pl]);
             pal_idx[(i - j) * stride + j] = order[m][color_idx];
         }
@@ -811,7 +811,7 @@
                 const unsigned pred_seg_id =
                     get_cur_frame_segid(t->by, t->bx, have_top, have_left,
                                         &seg_ctx, f->cur_segmap, f->b4_stride);
-                const unsigned diff = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                                           ts->cdf.m.seg_id[seg_ctx],
                                           DAV1D_MAX_SEGMENTS);
                 const unsigned last_active_seg_id =
@@ -883,7 +883,7 @@
             if (b->skip) {
                 b->seg_id = pred_seg_id;
             } else {
-                const unsigned diff = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                                           ts->cdf.m.seg_id[seg_ctx],
                                           DAV1D_MAX_SEGMENTS);
                 const unsigned last_active_seg_id =
@@ -932,8 +932,8 @@
         memcpy(prev_delta_lf, ts->last_delta_lf, 4);
 
         if (have_delta_q) {
-            int delta_q = dav1d_msac_decode_symbol_adapt(&ts->msac,
-                                                         ts->cdf.m.delta_q, 4);
+            int delta_q = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+                                                          ts->cdf.m.delta_q, 4);
             if (delta_q == 3) {
                 const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
                 delta_q = dav1d_msac_decode_bools(&ts->msac, n_bits) +
@@ -953,7 +953,7 @@
                     f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 ? 4 : 2 : 1;
 
                 for (int i = 0; i < n_lfs; i++) {
-                    int delta_lf = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                    int delta_lf = dav1d_msac_decode_symbol_adapt4(&ts->msac,
                         ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 4);
                     if (delta_lf == 3) {
                         const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
@@ -1018,8 +1018,8 @@
             ts->cdf.m.y_mode[dav1d_ymode_size_context[bs]] :
             ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]]
                         [dav1d_intra_mode_context[t->l.mode[by4]]];
-        b->y_mode = dav1d_msac_decode_symbol_adapt(&ts->msac, ymode_cdf,
-                                                   N_INTRA_PRED_MODES);
+        b->y_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, ymode_cdf,
+                                                     N_INTRA_PRED_MODES);
         if (DEBUG_BLOCK_INFO)
             printf("Post-ymode[%d]: r=%d\n", b->y_mode, ts->msac.rng);
 
@@ -1028,7 +1028,7 @@
             b->y_mode <= VERT_LEFT_PRED)
         {
             uint16_t *const acdf = ts->cdf.m.angle_delta[b->y_mode - VERT_PRED];
-            const int angle = dav1d_msac_decode_symbol_adapt(&ts->msac, acdf, 7);
+            const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 7);
             b->y_angle = angle - 3;
         } else {
             b->y_angle = 0;
@@ -1038,7 +1038,7 @@
             const int cfl_allowed = f->frame_hdr->segmentation.lossless[b->seg_id] ?
                 cbw4 == 1 && cbh4 == 1 : !!(cfl_allowed_mask & (1 << bs));
             uint16_t *const uvmode_cdf = ts->cdf.m.uv_mode[cfl_allowed][b->y_mode];
-            b->uv_mode = dav1d_msac_decode_symbol_adapt(&ts->msac, uvmode_cdf,
+            b->uv_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, uvmode_cdf,
                              N_UV_INTRA_PRED_MODES - !cfl_allowed);
             if (DEBUG_BLOCK_INFO)
                 printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng);
@@ -1045,13 +1045,13 @@
 
             if (b->uv_mode == CFL_PRED) {
 #define SIGN(a) (!!(a) + ((a) > 0))
-                const int sign = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                const int sign = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                                      ts->cdf.m.cfl_sign, 8) + 1;
                 const int sign_u = sign * 0x56 >> 8, sign_v = sign - sign_u * 3;
                 assert(sign_u == sign / 3);
                 if (sign_u) {
                     const int ctx = (sign_u == 2) * 3 + sign_v;
-                    b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                    b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
                                           ts->cdf.m.cfl_alpha[ctx], 16) + 1;
                     if (sign_u == 1) b->cfl_alpha[0] = -b->cfl_alpha[0];
                 } else {
@@ -1059,7 +1059,7 @@
                 }
                 if (sign_v) {
                     const int ctx = (sign_v == 2) * 3 + sign_u;
-                    b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                    b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
                                           ts->cdf.m.cfl_alpha[ctx], 16) + 1;
                     if (sign_v == 1) b->cfl_alpha[1] = -b->cfl_alpha[1];
                 } else {
@@ -1073,7 +1073,7 @@
                        b->uv_mode <= VERT_LEFT_PRED)
             {
                 uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED];
-                const int angle = dav1d_msac_decode_symbol_adapt(&ts->msac, acdf, 7);
+                const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 7);
                 b->uv_angle = angle - 3;
             } else {
                 b->uv_angle = 0;
@@ -1113,7 +1113,7 @@
                                       ts->cdf.m.use_filter_intra[bs]);
             if (is_filter) {
                 b->y_mode = FILTER_PRED;
-                b->y_angle = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                b->y_angle = dav1d_msac_decode_symbol_adapt4(&ts->msac,
                                  ts->cdf.m.filter_intra, 5);
             }
             if (DEBUG_BLOCK_INFO)
@@ -1156,7 +1156,7 @@
             if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE && t_dim->max > TX_4X4) {
                 const int tctx = get_tx_ctx(t->a, &t->l, t_dim, by4, bx4);
                 uint16_t *const tx_cdf = ts->cdf.m.txsz[t_dim->max - 1][tctx];
-                int depth = dav1d_msac_decode_symbol_adapt(&ts->msac, tx_cdf,
+                int depth = dav1d_msac_decode_symbol_adapt4(&ts->msac, tx_cdf,
                                 imin(t_dim->max + 1, 3));
 
                 while (depth--) {
@@ -1474,7 +1474,7 @@
                              ts->tiling.col_end, ts->tiling.row_start,
                              ts->tiling.row_end, f->libaom_cm);
 
-            b->inter_mode = dav1d_msac_decode_symbol_adapt(&ts->msac,
+            b->inter_mode = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                                 ts->cdf.m.comp_inter_mode[ctx],
                                 N_COMP_INTER_PRED_MODES);
             if (DEBUG_BLOCK_INFO)
@@ -1583,7 +1583,7 @@
                                    dav1d_msac_decode_bool_adapt(&ts->msac,
                                        ts->cdf.m.wedge_comp[ctx]);
                     if (b->comp_type == COMP_INTER_WEDGE)
-                        b->wedge_idx = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                        b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
                                            ts->cdf.m.wedge_idx[ctx], 16);
                 } else {
                     b->comp_type = COMP_INTER_SEG;
@@ -1737,7 +1737,7 @@
                 dav1d_msac_decode_bool_adapt(&ts->msac,
                                              ts->cdf.m.interintra[ii_sz_grp]))
             {
-                b->interintra_mode = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                b->interintra_mode = dav1d_msac_decode_symbol_adapt4(&ts->msac,
                                          ts->cdf.m.interintra_mode[ii_sz_grp],
                                          N_INTER_INTRA_PRED_MODES);
                 const int wedge_ctx = dav1d_wedge_ctx_lut[bs];
@@ -1745,7 +1745,7 @@
                                      dav1d_msac_decode_bool_adapt(&ts->msac,
                                          ts->cdf.m.interintra_wedge[wedge_ctx]);
                 if (b->interintra_type == INTER_INTRA_WEDGE)
-                    b->wedge_idx = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                    b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
                                        ts->cdf.m.wedge_idx[wedge_ctx], 16);
             } else {
                 b->interintra_type = INTER_INTRA_NONE;
@@ -1778,7 +1778,7 @@
                     f->frame_hdr->warp_motion && (mask[0] | mask[1]);
 
                 b->motion_mode = allow_warp ?
-                    dav1d_msac_decode_symbol_adapt(&ts->msac,
+                    dav1d_msac_decode_symbol_adapt4(&ts->msac,
                         ts->cdf.m.motion_mode[bs], 3) :
                     dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]);
                 if (b->motion_mode == MM_WARP) {
@@ -1817,7 +1817,7 @@
                 const int comp = b->comp_type != COMP_INTER_NONE;
                 const int ctx1 = get_filter_ctx(t->a, &t->l, comp, 0, b->ref[0],
                                                 by4, bx4);
-                filter[0] = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                filter[0] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
                                ts->cdf.m.filter[0][ctx1],
                                DAV1D_N_SWITCHABLE_FILTERS);
                 if (f->seq_hdr->dual_filter) {
@@ -1826,7 +1826,7 @@
                     if (DEBUG_BLOCK_INFO)
                         printf("Post-subpel_filter1[%d,ctx=%d]: r=%d\n",
                                filter[0], ctx1, ts->msac.rng);
-                    filter[1] = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                    filter[1] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
                                     ts->cdf.m.filter[1][ctx2],
                                     DAV1D_N_SWITCHABLE_FILTERS);
                     if (DEBUG_BLOCK_INFO)
@@ -2021,7 +2021,7 @@
         } else {
             const unsigned n_part = bl == BL_8X8 ? N_SUB8X8_PARTITIONS :
                 bl == BL_128X128 ? N_PARTITIONS - 2 : N_PARTITIONS;
-            bp = dav1d_msac_decode_symbol_adapt(&t->ts->msac, pc, n_part);
+            bp = dav1d_msac_decode_symbol_adapt16(&t->ts->msac, pc, n_part);
             if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 &&
                 (bp == PARTITION_V || bp == PARTITION_V4 ||
                  bp == PARTITION_T_LEFT_SPLIT || bp == PARTITION_T_RIGHT_SPLIT))
@@ -2365,7 +2365,7 @@
     Dav1dTileState *const ts = t->ts;
 
     if (frame_type == DAV1D_RESTORATION_SWITCHABLE) {
-        const int filter = dav1d_msac_decode_symbol_adapt(&ts->msac,
+        const int filter = dav1d_msac_decode_symbol_adapt4(&ts->msac,
                                ts->cdf.m.restore_switchable, 3);
         lr->type = filter ? filter == 2 ? DAV1D_RESTORATION_SGRPROJ :
                                           DAV1D_RESTORATION_WIENER :
--- a/src/meson.build
+++ b/src/meson.build
@@ -119,6 +119,7 @@
         # NASM source files
         libdav1d_sources_asm = files(
             'x86/cpuid.asm',
+            'x86/msac.asm',
         )
 
         if dav1d_bitdepths.contains('8')
--- a/src/msac.c
+++ b/src/msac.c
@@ -58,8 +58,8 @@
  * necessary), and stores them back in the decoder context.
  * dif: The new value of dif.
  * rng: The new value of the range. */
-static inline void ctx_norm(MsacContext *s, ec_win dif, uint32_t rng) {
-    const uint16_t d = 15 - (31 ^ clz(rng));
+static inline void ctx_norm(MsacContext *s, ec_win dif, unsigned rng) {
+    const int d = 15 ^ (31 ^ clz(rng));
     assert(rng <= 65535U);
     s->cnt -= d;
     s->dif = ((dif + 1) << d) - 1; /* Shift in 1s in the LSBs */
@@ -69,18 +69,17 @@
 }
 
 unsigned dav1d_msac_decode_bool_equi(MsacContext *const s) {
-    ec_win v, vw, dif = s->dif;
-    uint16_t r = s->rng;
-    unsigned ret;
+    ec_win vw, dif = s->dif;
+    unsigned ret, v, r = s->rng;
     assert((dif >> (EC_WIN_SIZE - 16)) < r);
     // When the probability is 1/2, f = 16384 >> EC_PROB_SHIFT = 256 and we can
     // replace the multiply with a simple shift.
     v = ((r >> 8) << 7) + EC_MIN_PROB;
-    vw   = v << (EC_WIN_SIZE - 16);
+    vw   = (ec_win)v << (EC_WIN_SIZE - 16);
     ret  = dif >= vw;
     dif -= ret*vw;
     v   += ret*(r - 2*v);
-    ctx_norm(s, dif, (unsigned) v);
+    ctx_norm(s, dif, v);
     return !ret;
 }
 
@@ -88,27 +87,26 @@
  * f: The probability that the bit is one
  * Return: The value decoded (0 or 1). */
 unsigned dav1d_msac_decode_bool(MsacContext *const s, const unsigned f) {
-    ec_win v, vw, dif = s->dif;
-    uint16_t r = s->rng;
-    unsigned ret;
+    ec_win vw, dif = s->dif;
+    unsigned ret, v, r = s->rng;
     assert((dif >> (EC_WIN_SIZE - 16)) < r);
     v = ((r >> 8) * (f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB;
-    vw   = v << (EC_WIN_SIZE - 16);
+    vw   = (ec_win)v << (EC_WIN_SIZE - 16);
     ret  = dif >= vw;
     dif -= ret*vw;
     v   += ret*(r - 2*v);
-    ctx_norm(s, dif, (unsigned) v);
+    ctx_norm(s, dif, v);
     return !ret;
 }
 
-unsigned dav1d_msac_decode_bools(MsacContext *const c, const unsigned l) {
-    int v = 0;
-    for (int n = (int) l - 1; n >= 0; n--)
-        v = (v << 1) | dav1d_msac_decode_bool_equi(c);
+unsigned dav1d_msac_decode_bools(MsacContext *const s, unsigned n) {
+    unsigned v = 0;
+    while (n--)
+        v = (v << 1) | dav1d_msac_decode_bool_equi(s);
     return v;
 }
 
-int dav1d_msac_decode_subexp(MsacContext *const c, const int ref,
+int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
                              const int n, const unsigned k)
 {
     int i = 0;
@@ -115,32 +113,31 @@
     int a = 0;
     int b = k;
     while ((2 << b) < n) {
-        if (!dav1d_msac_decode_bool_equi(c)) break;
+        if (!dav1d_msac_decode_bool_equi(s)) break;
         b = k + i++;
         a = (1 << b);
     }
-    const unsigned v = dav1d_msac_decode_bools(c, b) + a;
+    const unsigned v = dav1d_msac_decode_bools(s, b) + a;
     return ref * 2 <= n ? inv_recenter(ref, v) :
                           n - 1 - inv_recenter(n - 1 - ref, v);
 }
 
-int dav1d_msac_decode_uniform(MsacContext *const c, const unsigned n) {
+int dav1d_msac_decode_uniform(MsacContext *const s, const unsigned n) {
     assert(n > 0);
     const int l = ulog2(n) + 1;
     assert(l > 1);
     const unsigned m = (1 << l) - n;
-    const unsigned v = dav1d_msac_decode_bools(c, l - 1);
-    return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(c);
+    const unsigned v = dav1d_msac_decode_bools(s, l - 1);
+    return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(s);
 }
 
 /* Decodes a symbol given an inverse cumulative distribution function (CDF)
  * table in Q15. */
 static unsigned decode_symbol(MsacContext *const s, const uint16_t *const cdf,
-                              const unsigned n_symbols)
+                              const size_t n_symbols)
 {
-    ec_win u, v = s->rng, r = s->rng >> 8;
-    const ec_win c = s->dif >> (EC_WIN_SIZE - 16);
-    unsigned ret = 0;
+    const unsigned c = s->dif >> (EC_WIN_SIZE - 16);
+    unsigned u, v = s->rng, r = s->rng >> 8, ret = 0;
 
     assert(!cdf[n_symbols - 1]);
 
@@ -153,39 +150,34 @@
 
     assert(u <= s->rng);
 
-    ctx_norm(s, s->dif - (v << (EC_WIN_SIZE - 16)), (unsigned) (u - v));
+    ctx_norm(s, s->dif - ((ec_win)v << (EC_WIN_SIZE - 16)), u - v);
     return ret - 1;
 }
 
-static void update_cdf(uint16_t *const cdf, const unsigned val,
-                       const unsigned n_symbols)
+unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s,
+                                          uint16_t *const cdf,
+                                          const size_t n_symbols)
 {
-    const unsigned count = cdf[n_symbols];
-    const int rate = ((count >> 4) | 4) + (n_symbols > 3);
-    unsigned i;
-    for (i = 0; i < val; i++)
-        cdf[i] += (32768 - cdf[i]) >> rate;
-    for (; i < n_symbols - 1; i++)
-        cdf[i] -= cdf[i] >> rate;
-    cdf[n_symbols] = count + (count < 32);
-}
-
-unsigned dav1d_msac_decode_symbol_adapt(MsacContext *const c,
-                                        uint16_t *const cdf,
-                                        const unsigned n_symbols)
-{
-    const unsigned val = decode_symbol(c, cdf, n_symbols);
-    if(c->allow_update_cdf)
-        update_cdf(cdf, val, n_symbols);
+    const unsigned val = decode_symbol(s, cdf, n_symbols);
+    if (s->allow_update_cdf) {
+        const unsigned count = cdf[n_symbols];
+        const int rate = ((count >> 4) | 4) + (n_symbols > 3);
+        unsigned i;
+        for (i = 0; i < val; i++)
+            cdf[i] += (32768 - cdf[i]) >> rate;
+        for (; i < n_symbols - 1; i++)
+            cdf[i] -= cdf[i] >> rate;
+        cdf[n_symbols] = count + (count < 32);
+    }
     return val;
 }
 
-unsigned dav1d_msac_decode_bool_adapt(MsacContext *const c,
+unsigned dav1d_msac_decode_bool_adapt(MsacContext *const s,
                                       uint16_t *const cdf)
 {
-    const unsigned bit = dav1d_msac_decode_bool(c, *cdf);
+    const unsigned bit = dav1d_msac_decode_bool(s, *cdf);
 
-    if(c->allow_update_cdf){
+    if (s->allow_update_cdf) {
         // update_cdf() specialized for boolean CDFs
         const unsigned count = cdf[1];
         const int rate = (count >> 4) | 4;
--- a/src/msac.h
+++ b/src/msac.h
@@ -38,20 +38,37 @@
     const uint8_t *buf_pos;
     const uint8_t *buf_end;
     ec_win dif;
-    uint16_t rng;
+    unsigned rng;
     int cnt;
     int allow_update_cdf;
 } MsacContext;
 
-void dav1d_msac_init(MsacContext *c, const uint8_t *data, size_t sz,
+void dav1d_msac_init(MsacContext *s, const uint8_t *data, size_t sz,
                      int disable_cdf_update_flag);
-unsigned dav1d_msac_decode_symbol_adapt(MsacContext *s, uint16_t *cdf,
-                                        const unsigned n_symbols);
-unsigned dav1d_msac_decode_bool_equi(MsacContext *const s);
+unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *s, uint16_t *cdf,
+                                          size_t n_symbols);
+unsigned dav1d_msac_decode_bool_equi(MsacContext *s);
 unsigned dav1d_msac_decode_bool(MsacContext *s, unsigned f);
 unsigned dav1d_msac_decode_bool_adapt(MsacContext *s, uint16_t *cdf);
-unsigned dav1d_msac_decode_bools(MsacContext *c, unsigned l);
-int dav1d_msac_decode_subexp(MsacContext *c, int ref, int n, unsigned k);
-int dav1d_msac_decode_uniform(MsacContext *c, unsigned n);
+unsigned dav1d_msac_decode_bools(MsacContext *s, unsigned n);
+int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);
+int dav1d_msac_decode_uniform(MsacContext *s, unsigned n);
+
+/* Supported n_symbols ranges: adapt4: 1-5, adapt8: 1-8, adapt16: 4-16 */
+#if ARCH_X86_64 && HAVE_ASM
+unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
+                                              size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf,
+                                              size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
+                                               size_t n_symbols);
+#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
+#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
+#else
+#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt_c
+#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt_c
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt_c
+#endif
 
 #endif /* DAV1D_SRC_MSAC_H */
--- a/src/recon_tmpl.c
+++ b/src/recon_tmpl.c
@@ -107,7 +107,9 @@
             uint16_t *const txtp_cdf = intra ?
                        ts->cdf.m.txtp_intra[set_idx][t_dim->min][y_mode_nofilt] :
                        ts->cdf.m.txtp_inter[set_idx][t_dim->min];
-            idx = dav1d_msac_decode_symbol_adapt(&ts->msac, txtp_cdf, set_cnt);
+            idx = (set_cnt <= 8 ? dav1d_msac_decode_symbol_adapt8 :
+                     dav1d_msac_decode_symbol_adapt16)(&ts->msac, txtp_cdf, set_cnt);
+
             if (dbg)
             printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n",
                    set, set_idx, tx, t_dim->min, intra ? (int)y_mode_nofilt : -1,
@@ -122,19 +124,19 @@
     const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
     const int is_1d = tx_class != TX_CLASS_2D;
     switch (tx2dszctx) {
-#define case_sz(sz, bin) \
+#define case_sz(sz, bin, ns) \
     case sz: { \
         uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma][is_1d]; \
-        eob_bin = dav1d_msac_decode_symbol_adapt(&ts->msac, eob_bin_cdf, 5 + sz); \
+        eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 5 + sz); \
         break; \
     }
-    case_sz(0,   16);
-    case_sz(1,   32);
-    case_sz(2,   64);
-    case_sz(3,  128);
-    case_sz(4,  256);
-    case_sz(5,  512);
-    case_sz(6, 1024);
+    case_sz(0,   16,  4);
+    case_sz(1,   32,  8);
+    case_sz(2,   64,  8);
+    case_sz(3,  128,  8);
+    case_sz(4,  256, 16);
+    case_sz(5,  512, 16);
+    case_sz(6, 1024, 16);
 #undef case_sz
     }
     if (dbg)
@@ -179,8 +181,8 @@
         uint16_t *const lo_cdf = is_last ?
             ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx] :
             ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
-        int tok = dav1d_msac_decode_symbol_adapt(&ts->msac, lo_cdf,
-                                                 4 - is_last) + is_last;
+        int tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf,
+                                                  4 - is_last) + is_last;
         if (dbg)
         printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
                t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng);
@@ -190,7 +192,7 @@
         if (tok == 3) {
             const int br_ctx = get_br_ctx(levels, rc, tx, tx_class);
             do {
-                const int tok_br = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac,
                                        br_cdf[br_ctx], 4);
                 if (dbg)
                 printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n",
--- /dev/null
+++ b/src/x86/msac.asm
@@ -1,0 +1,287 @@
+; Copyright © 2019, VideoLAN and dav1d authors
+; Copyright © 2019, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64 ; avoids cacheline splits
+
+dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 ; unlabeled 4*(15-i) table, read as [pw_0xff00-2*n_symbols]; must stay directly before pw_0xff00
+pw_0xff00: times 8 dw 0xff00 ; keeps the high byte of each 16-bit lane (applied to the broadcast rng)
+pw_32:     times 8 dw 32 ; subtracted from the table row to get the offsets for lanes 8..15 in adapt16
+
+struc msac ; byte offsets of the context fields accessed through the first argument
+    .buf:        resq 1 ; current read position in the input
+    .end:        resq 1 ; end of the input; the refill code only reads bytes below it
+    .dif:        resq 1 ; 64-bit decoder window; its top 16 bits are compared against the scaled CDF values
+    .rng:        resd 1 ; current range; only the low 16 bits are broadcast and used
+    .cnt:        resd 1 ; reduced by the renorm shift; going negative triggers a refill
+    .update_cdf: resd 1 ; zero skips the CDF adaptation step
+endstruc ; NOTE(review): presumably has to mirror the C MsacContext layout - confirm against src/msac.h
+
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) ; full symbol name, used to jump to labels inside adapt4
+
+SECTION .text
+
+%if WIN64
+DECLARE_REG_TMP 3 ; t0 = r3: the context pointer is moved here (movifnidn) before ecx/cl is used for shifts
+%define buf rsp+8 ; shadow space
+%else
+DECLARE_REG_TMP 0 ; t0 = r0, i.e. the context pointer stays where it is
+%define buf rsp-40 ; red zone
+%endif
+
+INIT_XMM sse2
+cglobal msac_decode_symbol_adapt4, 3, 7, 6, s, cdf, ns ; 3 args, 7 GPRs, 6 XMM regs; works on the 4 words at cdfq
+    movd           m2, [sq+msac.rng]
+    movq           m1, [cdfq] ; cdf[0..3]
+    lea           rax, [pw_0xff00] ; base for both the mask and the table stored right before it
+    movq           m3, [sq+msac.dif]
+    mov           r3d, [sq+msac.update_cdf]
+    mov           r4d, nsd ; keep n_symbols, nsq is negated next
+    neg           nsq ; -n_symbols, used as a backwards index into the table
+    pshuflw        m2, m2, q0000 ; broadcast the low 16 bits of rng
+    movd     [buf+12], m2 ; leaves rng at buf+14, directly below v[0], so u for symbol 0 is rng
+    pand           m2, [rax] ; rng & 0xff00
+    mova           m0, m1 ; m0 = unmodified cdf, needed by the update step
+    psrlw          m1, 6
+    psllw          m1, 7
+    pmulhuw        m1, m2 ; (((cdf >> 6) << 7) * (rng & 0xff00)) >> 16
+    movq           m2, [rax+nsq*2] ; table row: lane i = 4*(n_symbols-1-i) for i < n_symbols
+    pshuflw        m3, m3, q3333 ; broadcast c = top 16 bits of dif
+    paddw          m1, m2 ; v[i]
+    mova     [buf+16], m1 ; spill v[] for the scalar lookup in .renorm
+    psubusw        m1, m3 ; saturates to 0 exactly in the lanes where v <= c
+    pxor           m2, m2
+    pcmpeqw        m1, m2 ; c >= v
+    pmovmskb      eax, m1 ; two mask bits per 16-bit lane
+    test          r3d, r3d
+    jz .renorm ; !allow_update_cdf
+
+; update_cdf:
+    movzx         r3d, word [cdfq+r4*2] ; count
+    pcmpeqw        m2, m2 ; all ones
+    mov           r2d, r3d
+    shr           r3d, 4
+    cmp           r4d, 4 ; CF = n_symbols < 4
+    sbb           r3d, -5 ; (count >> 4) + (n_symbols > 3) + 4
+    cmp           r2d, 32 ; CF = count < 32
+    adc           r2d, 0  ; count + (count < 32)
+    movd           m3, r3d ; rate, used as the shift count below
+    pavgw          m2, m1 ; i >= val ? -1 : 32768
+    psubw          m2, m0 ; for (i = 0; i < val; i++)
+    psubw          m0, m1 ;     cdf[i] += (32768 - cdf[i]) >> rate;
+    psraw          m2, m3 ; for (; i < n_symbols - 1; i++)
+    paddw          m0, m2 ;     cdf[i] += ((  -1 - cdf[i]) >> rate) + 1;
+    movq       [cdfq], m0 ; writes all 4 words, including lanes at or past n_symbols
+    mov   [cdfq+r4*2], r2w ; count is stored last so it overrides whatever the vector store put there
+
+.renorm:
+    tzcnt         eax, eax ; byte offset of the first lane with c >= v (2 * symbol index)
+    mov            r4, [sq+msac.dif]
+    movzx         r1d, word [buf+rax+16] ; v
+    movzx         r2d, word [buf+rax+14] ; u
+    shr           eax, 1 ; eax = symbol index
+.renorm2: ; shared tail; adapt16 jumps here with eax = symbol, r1d = v, r2d = u, r4 = dif
+    not            r4
+    sub           r2d, r1d ; rng
+    shl            r1, 48
+    add            r4, r1  ; ~dif + (v << 48) == ~(dif - (v << 48))
+    mov           r1d, [sq+msac.cnt]
+    movifnidn      t0, sq ; copies only when t0 != sq (WIN64), freeing rcx for the cl shifts
+    bsr           ecx, r2d
+    xor           ecx, 15  ; d
+    shl           r2d, cl
+    shl            r4, cl
+    mov [t0+msac.rng], r2d
+    not            r4 ; back to dif; shifting the complement leaves the low d bits set
+    sub           r1d, ecx ; cnt -= d
+    jge .end ; no refill required
+
+; refill:
+    mov            r2, [t0+msac.buf]
+    mov           rcx, [t0+msac.end]
+    lea            r5, [r2+8]
+    cmp            r5, rcx
+    jg .refill_eob ; fewer than 8 bytes left. NOTE(review): signed compare on pointers (jg, not ja) - confirm intended
+    mov            r2, [r2] ; 8 input bytes at once
+    lea           ecx, [r1+23]
+    add           r1d, 16
+    shr           ecx, 3   ; shift_bytes
+    bswap          r2 ; first input byte becomes the most significant one
+    sub            r5, rcx ; new buf = buf + 8 - shift_bytes
+    shl           ecx, 3   ; shift_bits
+    shr            r2, cl
+    sub           ecx, r1d ; shift_bits - 16 - cnt
+    mov           r1d, 48
+    shl            r2, cl
+    mov [t0+msac.buf], r5
+    sub           r1d, ecx ; cnt + 64 - shift_bits
+    xor            r4, r2 ; merge the new bits into dif
+.end:
+    mov [t0+msac.cnt], r1d
+    mov [t0+msac.dif], r4
+    RET
+.refill_eob: ; avoid overreading the input buffer
+    mov            r5, rcx ; r5 = end, rcx is needed as the shift count
+    mov           ecx, 40
+    sub           ecx, r1d ; c
+.refill_eob_loop:
+    cmp            r2, r5
+    jge .refill_eob_end    ; eob reached
+    movzx         r1d, byte [r2]
+    inc            r2
+    shl            r1, cl
+    xor            r4, r1 ; dif ^= byte << c
+    sub           ecx, 8
+    jge .refill_eob_loop ; keep going while c >= 0
+.refill_eob_end:
+    mov           r1d, 40
+    sub           r1d, ecx ; cnt = 40 - c
+    mov [t0+msac.buf], r2
+    mov [t0+msac.dif], r4
+    mov [t0+msac.cnt], r1d
+    RET
+
+cglobal msac_decode_symbol_adapt8, 3, 7, 6, s, cdf, ns ; same steps as adapt4 on the 8 words at cdfq; finishes in adapt4's .renorm
+    movd           m2, [sq+msac.rng]
+    movu           m1, [cdfq] ; cdf[0..7], unaligned load
+    lea           rax, [pw_0xff00] ; base for both the mask and the table stored right before it
+    movq           m3, [sq+msac.dif]
+    mov           r3d, [sq+msac.update_cdf]
+    mov           r4d, nsd ; keep n_symbols, nsq is negated next
+    neg           nsq
+    pshuflw        m2, m2, q0000 ; broadcast the low 16 bits of rng to the low 4 lanes
+    movd     [buf+12], m2 ; leaves rng at buf+14, directly below v[0]
+    punpcklqdq     m2, m2 ; extend the broadcast to all 8 lanes
+    mova           m0, m1 ; m0 = unmodified cdf, needed by the update step
+    psrlw          m1, 6
+    pand           m2, [rax] ; rng & 0xff00
+    psllw          m1, 7
+    pmulhuw        m1, m2 ; (((cdf >> 6) << 7) * (rng & 0xff00)) >> 16
+    movu           m2, [rax+nsq*2] ; table row: lane i = 4*(n_symbols-1-i) for i < n_symbols
+    pshuflw        m3, m3, q3333 ; c = top 16 bits of dif in the low 4 lanes
+    paddw          m1, m2 ; v[i]
+    punpcklqdq     m3, m3 ; c in all 8 lanes
+    mova     [buf+16], m1 ; spill v[] where adapt4's .renorm reads it
+    psubusw        m1, m3 ; saturates to 0 exactly in the lanes where v <= c
+    pxor           m2, m2
+    pcmpeqw        m1, m2 ; c >= v
+    pmovmskb      eax, m1 ; two mask bits per 16-bit lane
+    test          r3d, r3d
+    jz m(msac_decode_symbol_adapt4).renorm ; no adaptation: straight to the shared renorm
+    movzx         r3d, word [cdfq+r4*2] ; count
+    pcmpeqw        m2, m2 ; all ones
+    mov           r2d, r3d
+    shr           r3d, 4
+    cmp           r4d, 4 ; may be called with n_symbols < 4
+    sbb           r3d, -5 ; (count >> 4) + (n_symbols > 3) + 4
+    cmp           r2d, 32 ; CF = count < 32
+    adc           r2d, 0 ; count + (count < 32)
+    movd           m3, r3d ; rate, used as the shift count below
+    pavgw          m2, m1 ; i >= val ? -1 : 32768
+    psubw          m2, m0
+    psubw          m0, m1
+    psraw          m2, m3
+    paddw          m0, m2 ; updated cdf lanes (same arithmetic as in adapt4)
+    movu       [cdfq], m0 ; writes all 8 words, including lanes at or past n_symbols
+    mov   [cdfq+r4*2], r2w ; count is stored last so it overrides whatever the vector store put there
+    jmp m(msac_decode_symbol_adapt4).renorm
+
+cglobal msac_decode_symbol_adapt16, 3, 7, 6, s, cdf, ns ; 16 CDF words in two registers; finishes in adapt4's .renorm2
+    movd           m4, [sq+msac.rng]
+    movu           m2, [cdfq] ; cdf[0..7]
+    lea           rax, [pw_0xff00] ; base for the mask, the table before it and pw_32
+    movu           m3, [cdfq+16] ; cdf[8..15]
+    movq           m5, [sq+msac.dif]
+    mov           r3d, [sq+msac.update_cdf]
+    mov           r4d, nsd ; keep n_symbols, nsq is negated next
+    neg           nsq
+%if WIN64
+    sub           rsp, 48 ; need 36 bytes, shadow space is only 32
+%endif
+    pshuflw        m4, m4, q0000 ; broadcast the low 16 bits of rng to the low 4 lanes
+    movd      [buf-4], m4 ; leaves rng at buf-2, directly below v[0]
+    punpcklqdq     m4, m4 ; extend the broadcast to all 8 lanes
+    mova           m0, m2 ; m0/m1 = unmodified cdf, needed by the update step
+    psrlw          m2, 6
+    mova           m1, m3
+    psrlw          m3, 6
+    pand           m4, [rax] ; rng & 0xff00
+    psllw          m2, 7
+    psllw          m3, 7
+    pmulhuw        m2, m4 ; (((cdf >> 6) << 7) * (rng & 0xff00)) >> 16, lanes 0..7
+    pmulhuw        m3, m4 ; same for lanes 8..15
+    movu           m4, [rax+nsq*2] ; table row: lane i = 4*(n_symbols-1-i) for i < n_symbols
+    pshuflw        m5, m5, q3333 ; c = top 16 bits of dif in the low 4 lanes
+    paddw          m2, m4 ; v[0..7]
+    psubw          m4, [rax-pw_0xff00+pw_32] ; row - 32 = offsets for lanes 8..15
+    punpcklqdq     m5, m5 ; c in all 8 lanes
+    paddw          m3, m4 ; v[8..15]
+    mova        [buf], m2 ; spill v[0..7]
+    mova     [buf+16], m3 ; spill v[8..15]
+    psubusw        m2, m5 ; saturates to 0 exactly in the lanes where v <= c
+    psubusw        m3, m5
+    pxor           m4, m4
+    pcmpeqw        m2, m4 ; c >= v, lanes 0..7
+    pcmpeqw        m3, m4 ; c >= v, lanes 8..15
+    packsswb       m5, m2, m3 ; narrow the word masks to bytes so there is one mask bit per lane
+    pmovmskb      eax, m5
+    test          r3d, r3d
+    jz .renorm ; update_cdf == 0: skip the adaptation
+    movzx         r3d, word [cdfq+r4*2] ; count
+    pcmpeqw        m4, m4 ; all ones
+    mova           m5, m4
+    lea           r2d, [r3+80] ; only support n_symbols >= 4
+    shr           r2d, 4 ; rate = (count >> 4) + 5
+    cmp           r3d, 32 ; CF = count < 32
+    adc           r3d, 0 ; count + (count < 32)
+    pavgw          m4, m2 ; i >= val ? -1 : 32768
+    pavgw          m5, m3
+    psubw          m4, m0
+    psubw          m0, m2
+    movd           m2, r2d ; rate, used as the shift count below
+    psubw          m5, m1
+    psubw          m1, m3
+    psraw          m4, m2
+    psraw          m5, m2
+    paddw          m0, m4 ; updated cdf[0..7]
+    paddw          m1, m5 ; updated cdf[8..15]
+    movu       [cdfq], m0
+    movu    [cdfq+16], m1
+    mov   [cdfq+r4*2], r3w ; count is stored last so it overrides whatever the vector stores put there
+.renorm:
+    tzcnt         eax, eax ; index of the first lane with c >= v; already the symbol index here
+    mov            r4, [sq+msac.dif]
+    movzx         r1d, word [buf+rax*2] ; v
+    movzx         r2d, word [buf+rax*2-2] ; u
+%if WIN64
+    add           rsp, 48 ; release the scratch area before leaving through the shared tail
+%endif
+    jmp m(msac_decode_symbol_adapt4).renorm2
+
+%endif
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -62,6 +62,7 @@
     const char *name;
     void (*func)(void);
 } tests[] = {
+    { "msac", checkasm_check_msac },
 #if CONFIG_8BPC
     { "cdef_8bpc", checkasm_check_cdef_8bpc },
     { "ipred_8bpc", checkasm_check_ipred_8bpc },
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -57,6 +57,7 @@
 name##_8bpc(void); \
 name##_16bpc(void)
 
+void checkasm_check_msac(void);
 decl_check_bitfns(void checkasm_check_cdef);
 decl_check_bitfns(void checkasm_check_ipred);
 decl_check_bitfns(void checkasm_check_itx);
--- /dev/null
+++ b/tests/checkasm/msac.c
@@ -1,0 +1,115 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "tests/checkasm/checkasm.h"
+
+#include "src/cpu.h"
+#include "src/msac.h"
+
+#include <string.h>
+
+/* The normal code doesn't use function pointers; this table only lets the test swap implementations */
+typedef unsigned (*decode_symbol_adapt_fn)(MsacContext *s, uint16_t *cdf,
+                                           size_t n_symbols);
+
+typedef struct {
+    decode_symbol_adapt_fn symbol_adapt4;  /* checked with n_symbols 1..5 */
+    decode_symbol_adapt_fn symbol_adapt8;  /* checked with n_symbols 1..8 */
+    decode_symbol_adapt_fn symbol_adapt16; /* checked with n_symbols 4..16 */
+} MsacDSPContext;
+
+static void randomize_cdf(uint16_t *const cdf, int n) {
+    for (int pad = 16; pad > n; pad--) cdf[pad] = rnd(); /* noise in the padding */
+    cdf[n - 1] = 0; /* last CDF entry */
+    cdf[n] = 0;     /* slot right after the CDF entries starts at zero */
+    for (int i = n - 1; i > 0; i--) /* fill downwards, each entry above the next */
+        cdf[i - 1] = cdf[i] + 1 + rnd() % (32768 - cdf[i] - i);
+}
+
+/* Field-by-field check; memcmp() on structs can misbehave due to padding etc. */
+static int msac_cmp(const MsacContext *const a, const MsacContext *const b) {
+    const int same_pos = a->buf_pos == b->buf_pos && a->buf_end == b->buf_end;
+    const int same_win = a->dif == b->dif && a->rng == b->rng && a->cnt == b->cnt;
+    return !(same_pos && same_win && a->allow_update_cdf == b->allow_update_cdf);
+}
+
+#define CHECK_SYMBOL_ADAPT(n, n_min, n_max) do { /* uses c, cdf, buf, s_c, s_a from the caller */ \
+    if (check_func(c->symbol_adapt##n, "msac_decode_symbol_adapt%d", n)) { \
+        for (int cdf_update = 0; cdf_update <= 1; cdf_update++) {          \
+            for (int ns = n_min; ns <= n_max; ns++) {                      \
+                dav1d_msac_init(&s_c, buf, sizeof(buf), !cdf_update);      \
+                s_a = s_c; /* both decoders start from the same state */   \
+                randomize_cdf(cdf[0], ns);                                 \
+                memcpy(cdf[1], cdf[0], sizeof(*cdf)); /* incl. padding */  \
+                for (int i = 0; i < 64; i++) { /* 64 symbols per CDF */    \
+                    unsigned c_res = call_ref(&s_c, cdf[0], ns);           \
+                    unsigned a_res = call_new(&s_a, cdf[1], ns);           \
+                    if (c_res != a_res || msac_cmp(&s_c, &s_a) ||          \
+                        memcmp(cdf[0], cdf[1], sizeof(**cdf) * (ns + 1)))  \
+                    { /* entries past cdf[ns] are not compared */          \
+                        fail();                                            \
+                    }                                                      \
+                }                                                          \
+                if (cdf_update && ns == n) /* benchmark one case only */   \
+                    bench_new(&s_a, cdf[0], n);                            \
+            }                                                              \
+        }                                                                  \
+    }                                                                      \
+} while (0)
+
+static void check_decode_symbol_adapt(MsacDSPContext *const c) {
+    /* Use an aligned CDF buffer for more consistent benchmark
+     * results, and a misaligned one for checking correctness. */
+    ALIGN_STK_16(uint16_t, cdf, 2, [17]); /* cdf[0]: reference + benchmark, cdf[1]: function under test */
+    MsacContext s_c, s_a; /* decoder state for the reference (s_c) and the tested (s_a) function */
+    uint8_t buf[1024]; /* random input shared by both decoders */
+    for (int i = 0; i < 1024; i++)
+        buf[i] = rnd();
+
+    declare_func(unsigned, MsacContext *s, uint16_t *cdf, size_t n_symbols);
+    CHECK_SYMBOL_ADAPT( 4, 1,  5); /* (function width, smallest ns, largest ns) */
+    CHECK_SYMBOL_ADAPT( 8, 1,  8);
+    CHECK_SYMBOL_ADAPT(16, 4, 16); /* starts at 4: the adapt16 asm only supports n_symbols >= 4 */
+    report("decode_symbol_adapt");
+}
+
+void checkasm_check_msac(void) {
+    MsacDSPContext c = {
+        .symbol_adapt4  = dav1d_msac_decode_symbol_adapt_c,
+        .symbol_adapt8  = dav1d_msac_decode_symbol_adapt_c,
+        .symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c,
+    };
+#if ARCH_X86_64 && HAVE_ASM
+    /* Swap in the SSE2 versions when the CPU reports support for them. */
+    if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) {
+        c.symbol_adapt4  = dav1d_msac_decode_symbol_adapt4_sse2;
+        c.symbol_adapt8  = dav1d_msac_decode_symbol_adapt8_sse2;
+        c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
+    }
+#endif
+    check_decode_symbol_adapt(&c);
+}
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -34,7 +34,10 @@
 libdav1d_nasm_objs_if_needed = []
 
 if is_asm_enabled
-    checkasm_sources = files('checkasm/checkasm.c')
+    checkasm_sources = files(
+        'checkasm/checkasm.c',
+        'checkasm/msac.c',
+    )
 
     checkasm_tmpl_sources = files(
         'checkasm/cdef.c',