shithub: libvpx

Download patch

ref: a1e20ec58f3b562af536be80f8dd061878cbaffd
parent: c77d4dcb355798870bde4e759a547203e899bcc9
author: Alex Converse <aconverse@google.com>
date: Fri Mar 20 10:51:59 EDT 2015

Refactor fast loop filter code to handle 444.

Change-Id: I921b1ebabdf617049f8fa26fbe462c3ff115c1ce

--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -293,7 +293,7 @@
   }
 }
 
-static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
+static void filter_selectively_vert_row2(int subsampling_factor,
                                          uint8_t *s, int pitch,
                                          unsigned int mask_16x16_l,
                                          unsigned int mask_8x8_l,
@@ -301,9 +301,9 @@
                                          unsigned int mask_4x4_int_l,
                                          const loop_filter_info_n *lfi_n,
                                          const uint8_t *lfl) {
-  const int mask_shift = plane_type ? 4 : 8;
-  const int mask_cutoff = plane_type ? 0xf : 0xff;
-  const int lfl_forward = plane_type ? 4 : 8;
+  const int mask_shift = subsampling_factor ? 4 : 8;
+  const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+  const int lfl_forward = subsampling_factor ? 4 : 8;
 
   unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
   unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
@@ -393,7 +393,7 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static void highbd_filter_selectively_vert_row2(PLANE_TYPE plane_type,
+static void highbd_filter_selectively_vert_row2(int subsampling_factor,
                                                 uint16_t *s, int pitch,
                                                 unsigned int mask_16x16_l,
                                                 unsigned int mask_8x8_l,
@@ -401,9 +401,9 @@
                                                 unsigned int mask_4x4_int_l,
                                                 const loop_filter_info_n *lfi_n,
                                                 const uint8_t *lfl, int bd) {
-  const int mask_shift = plane_type ? 4 : 8;
-  const int mask_cutoff = plane_type ? 0xf : 0xff;
-  const int lfl_forward = plane_type ? 4 : 8;
+  const int mask_shift = subsampling_factor ? 4 : 8;
+  const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+  const int lfl_forward = subsampling_factor ? 4 : 8;
 
   unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
   unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
@@ -1326,248 +1326,203 @@
   }
 }
 
-void vp9_filter_block_plane(VP9_COMMON *const cm,
-                            struct macroblockd_plane *const plane,
-                            int mi_row,
-                            LOOP_FILTER_MASK *lfm) {
+void vp9_filter_block_plane_ss00(VP9_COMMON *const cm,
+                                 struct macroblockd_plane *const plane,
+                                 int mi_row,
+                                 LOOP_FILTER_MASK *lfm) {
   struct buf_2d *const dst = &plane->dst;
-  uint8_t* const dst0 = dst->buf;
-  int r, c;
+  uint8_t *const dst0 = dst->buf;
+  int r;
+  uint64_t mask_16x16 = lfm->left_y[TX_16X16];
+  uint64_t mask_8x8 = lfm->left_y[TX_8X8];
+  uint64_t mask_4x4 = lfm->left_y[TX_4X4];
+  uint64_t mask_4x4_int = lfm->int_4x4_y;
 
-  if (!plane->plane_type) {
-    uint64_t mask_16x16 = lfm->left_y[TX_16X16];
-    uint64_t mask_8x8 = lfm->left_y[TX_8X8];
-    uint64_t mask_4x4 = lfm->left_y[TX_4X4];
-    uint64_t mask_4x4_int = lfm->int_4x4_y;
+  assert(plane->subsampling_x == 0 && plane->subsampling_y == 0);
 
-    // Vertical pass: do 2 rows at one time
-    for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
-      unsigned int mask_16x16_l = mask_16x16 & 0xffff;
-      unsigned int mask_8x8_l = mask_8x8 & 0xffff;
-      unsigned int mask_4x4_l = mask_4x4 & 0xffff;
-      unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
+  // Vertical pass: do 2 rows at one time
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+    unsigned int mask_16x16_l = mask_16x16 & 0xffff;
+    unsigned int mask_8x8_l = mask_8x8 & 0xffff;
+    unsigned int mask_4x4_l = mask_4x4 & 0xffff;
+    unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
 
-      // Disable filtering on the leftmost column.
+// Disable filtering on the leftmost column.
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (cm->use_highbitdepth) {
-        highbd_filter_selectively_vert_row2(plane->plane_type,
-                                            CONVERT_TO_SHORTPTR(dst->buf),
-                                            dst->stride,
-                                            mask_16x16_l,
-                                            mask_8x8_l,
-                                            mask_4x4_l,
-                                            mask_4x4_int_l,
-                                            &cm->lf_info, &lfm->lfl_y[r << 3],
-                                            (int)cm->bit_depth);
-      } else {
-        filter_selectively_vert_row2(plane->plane_type,
-                                     dst->buf, dst->stride,
-                                     mask_16x16_l,
-                                     mask_8x8_l,
-                                     mask_4x4_l,
-                                     mask_4x4_int_l,
-                                     &cm->lf_info,
-                                     &lfm->lfl_y[r << 3]);
-      }
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_vert_row2(
+          plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
+          mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+          &lfm->lfl_y[r << 3], (int)cm->bit_depth);
+    } else {
+      filter_selectively_vert_row2(
+          plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
+          mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
+    }
 #else
-      filter_selectively_vert_row2(plane->plane_type,
-                                   dst->buf, dst->stride,
-                                   mask_16x16_l,
-                                   mask_8x8_l,
-                                   mask_4x4_l,
-                                   mask_4x4_int_l,
-                                   &cm->lf_info, &lfm->lfl_y[r << 3]);
+    filter_selectively_vert_row2(
+        plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
+        mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-      dst->buf += 16 * dst->stride;
-      mask_16x16 >>= 16;
-      mask_8x8 >>= 16;
-      mask_4x4 >>= 16;
-      mask_4x4_int >>= 16;
+    dst->buf += 16 * dst->stride;
+    mask_16x16 >>= 16;
+    mask_8x8 >>= 16;
+    mask_4x4 >>= 16;
+    mask_4x4_int >>= 16;
+  }
+
+  // Horizontal pass
+  dst->buf = dst0;
+  mask_16x16 = lfm->above_y[TX_16X16];
+  mask_8x8 = lfm->above_y[TX_8X8];
+  mask_4x4 = lfm->above_y[TX_4X4];
+  mask_4x4_int = lfm->int_4x4_y;
+
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) {
+    unsigned int mask_16x16_r;
+    unsigned int mask_8x8_r;
+    unsigned int mask_4x4_r;
+
+    if (mi_row + r == 0) {
+      mask_16x16_r = 0;
+      mask_8x8_r = 0;
+      mask_4x4_r = 0;
+    } else {
+      mask_16x16_r = mask_16x16 & 0xff;
+      mask_8x8_r = mask_8x8 & 0xff;
+      mask_4x4_r = mask_4x4 & 0xff;
     }
 
-    // Horizontal pass
-    dst->buf = dst0;
-    mask_16x16 = lfm->above_y[TX_16X16];
-    mask_8x8 = lfm->above_y[TX_8X8];
-    mask_4x4 = lfm->above_y[TX_4X4];
-    mask_4x4_int = lfm->int_4x4_y;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_horiz(
+          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r,
+          mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, &lfm->lfl_y[r << 3],
+          (int)cm->bit_depth);
+    } else {
+      filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+                               mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
+                               &lfm->lfl_y[r << 3]);
+    }
+#else
+    filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+                             mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
+                             &lfm->lfl_y[r << 3]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) {
-      unsigned int mask_16x16_r;
-      unsigned int mask_8x8_r;
-      unsigned int mask_4x4_r;
+    dst->buf += 8 * dst->stride;
+    mask_16x16 >>= 8;
+    mask_8x8 >>= 8;
+    mask_4x4 >>= 8;
+    mask_4x4_int >>= 8;
+  }
+}
 
-      if (mi_row + r == 0) {
-        mask_16x16_r = 0;
-        mask_8x8_r = 0;
-        mask_4x4_r = 0;
-      } else {
-        mask_16x16_r = mask_16x16 & 0xff;
-        mask_8x8_r = mask_8x8 & 0xff;
-        mask_4x4_r = mask_4x4 & 0xff;
+void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
+                                 struct macroblockd_plane *const plane,
+                                 int mi_row,
+                                 LOOP_FILTER_MASK *lfm) {
+  struct buf_2d *const dst = &plane->dst;
+  uint8_t *const dst0 = dst->buf;
+  int r, c;
+
+  uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
+  uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
+  uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
+  uint16_t mask_4x4_int = lfm->int_4x4_uv;
+
+  assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
+
+  // Vertical pass: do 2 rows at one time
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) {
+    if (plane->plane_type == 1) {
+      for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) {
+        lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
+        lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
       }
+    }
 
+    {
+      unsigned int mask_16x16_l = mask_16x16 & 0xff;
+      unsigned int mask_8x8_l = mask_8x8 & 0xff;
+      unsigned int mask_4x4_l = mask_4x4 & 0xff;
+      unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
+
+// Disable filtering on the leftmost column.
 #if CONFIG_VP9_HIGHBITDEPTH
       if (cm->use_highbitdepth) {
-        highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
-                                        dst->stride,
-                                        mask_16x16_r,
-                                        mask_8x8_r,
-                                        mask_4x4_r,
-                                        mask_4x4_int & 0xff,
-                                        &cm->lf_info,
-                                        &lfm->lfl_y[r << 3],
-                                        (int)cm->bit_depth);
+        highbd_filter_selectively_vert_row2(
+            plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
+            mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+            &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
       } else {
-        filter_selectively_horiz(dst->buf, dst->stride,
-                                 mask_16x16_r,
-                                 mask_8x8_r,
-                                 mask_4x4_r,
-                                 mask_4x4_int & 0xff,
-                                 &cm->lf_info,
-                                 &lfm->lfl_y[r << 3]);
+        filter_selectively_vert_row2(
+            plane->subsampling_x, dst->buf, dst->stride,
+            mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+            &lfm->lfl_uv[r << 1]);
       }
 #else
-      filter_selectively_horiz(dst->buf, dst->stride,
-                               mask_16x16_r,
-                               mask_8x8_r,
-                               mask_4x4_r,
-                               mask_4x4_int & 0xff,
-                               &cm->lf_info,
-                               &lfm->lfl_y[r << 3]);
+      filter_selectively_vert_row2(
+          plane->subsampling_x, dst->buf, dst->stride,
+          mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+          &lfm->lfl_uv[r << 1]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-      dst->buf += 8 * dst->stride;
+      dst->buf += 16 * dst->stride;
       mask_16x16 >>= 8;
       mask_8x8 >>= 8;
       mask_4x4 >>= 8;
       mask_4x4_int >>= 8;
     }
-  } else {
-    uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
-    uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
-    uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
-    uint16_t mask_4x4_int = lfm->int_4x4_uv;
+  }
 
-    // Vertical pass: do 2 rows at one time
-    for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) {
-      if (plane->plane_type == 1) {
-        for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) {
-          lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
-          lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) +
-                                                       (c << 1)];
-        }
-      }
+  // Horizontal pass
+  dst->buf = dst0;
+  mask_16x16 = lfm->above_uv[TX_16X16];
+  mask_8x8 = lfm->above_uv[TX_8X8];
+  mask_4x4 = lfm->above_uv[TX_4X4];
+  mask_4x4_int = lfm->int_4x4_uv;
 
-      {
-        unsigned int mask_16x16_l = mask_16x16 & 0xff;
-        unsigned int mask_8x8_l = mask_8x8 & 0xff;
-        unsigned int mask_4x4_l = mask_4x4 & 0xff;
-        unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+    const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
+    const unsigned int mask_4x4_int_r =
+        skip_border_4x4_r ? 0 : (mask_4x4_int & 0xf);
+    unsigned int mask_16x16_r;
+    unsigned int mask_8x8_r;
+    unsigned int mask_4x4_r;
 
-        // Disable filtering on the leftmost column.
-#if CONFIG_VP9_HIGHBITDEPTH
-        if (cm->use_highbitdepth) {
-          highbd_filter_selectively_vert_row2(plane->plane_type,
-                                              CONVERT_TO_SHORTPTR(dst->buf),
-                                              dst->stride,
-                                              mask_16x16_l,
-                                              mask_8x8_l,
-                                              mask_4x4_l,
-                                              mask_4x4_int_l,
-                                              &cm->lf_info,
-                                              &lfm->lfl_uv[r << 1],
-                                              (int)cm->bit_depth);
-        } else {
-          filter_selectively_vert_row2(plane->plane_type,
-                                       dst->buf, dst->stride,
-                                       mask_16x16_l,
-                                       mask_8x8_l,
-                                       mask_4x4_l,
-                                       mask_4x4_int_l,
-                                       &cm->lf_info,
-                                       &lfm->lfl_uv[r << 1]);
-        }
-#else
-        filter_selectively_vert_row2(plane->plane_type,
-                                     dst->buf, dst->stride,
-                                     mask_16x16_l,
-                                     mask_8x8_l,
-                                     mask_4x4_l,
-                                     mask_4x4_int_l,
-                                     &cm->lf_info,
-                                     &lfm->lfl_uv[r << 1]);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-        dst->buf += 16 * dst->stride;
-        mask_16x16 >>= 8;
-        mask_8x8 >>= 8;
-        mask_4x4 >>= 8;
-        mask_4x4_int >>= 8;
-      }
+    if (mi_row + r == 0) {
+      mask_16x16_r = 0;
+      mask_8x8_r = 0;
+      mask_4x4_r = 0;
+    } else {
+      mask_16x16_r = mask_16x16 & 0xf;
+      mask_8x8_r = mask_8x8 & 0xf;
+      mask_4x4_r = mask_4x4 & 0xf;
     }
 
-    // Horizontal pass
-    dst->buf = dst0;
-    mask_16x16 = lfm->above_uv[TX_16X16];
-    mask_8x8 = lfm->above_uv[TX_8X8];
-    mask_4x4 = lfm->above_uv[TX_4X4];
-    mask_4x4_int = lfm->int_4x4_uv;
-
-    for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
-      const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
-      const unsigned int mask_4x4_int_r = skip_border_4x4_r ?
-          0 : (mask_4x4_int & 0xf);
-      unsigned int mask_16x16_r;
-      unsigned int mask_8x8_r;
-      unsigned int mask_4x4_r;
-
-      if (mi_row + r == 0) {
-        mask_16x16_r = 0;
-        mask_8x8_r = 0;
-        mask_4x4_r = 0;
-      } else {
-        mask_16x16_r = mask_16x16 & 0xf;
-        mask_8x8_r = mask_8x8 & 0xf;
-        mask_4x4_r = mask_4x4 & 0xf;
-      }
-
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (cm->use_highbitdepth) {
-        highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
-                                        dst->stride,
-                                        mask_16x16_r,
-                                        mask_8x8_r,
-                                        mask_4x4_r,
-                                        mask_4x4_int_r,
-                                        &cm->lf_info,
-                                        &lfm->lfl_uv[r << 1],
-                                        (int)cm->bit_depth);
-      } else {
-        filter_selectively_horiz(dst->buf, dst->stride,
-                                 mask_16x16_r,
-                                 mask_8x8_r,
-                                 mask_4x4_r,
-                                 mask_4x4_int_r,
-                                 &cm->lf_info,
-                                 &lfm->lfl_uv[r << 1]);
-      }
-#else
-      filter_selectively_horiz(dst->buf, dst->stride,
-                               mask_16x16_r,
-                               mask_8x8_r,
-                               mask_4x4_r,
-                               mask_4x4_int_r,
-                               &cm->lf_info,
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                      dst->stride, mask_16x16_r, mask_8x8_r,
+                                      mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
+                                      &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
+    } else {
+      filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+                               mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
                                &lfm->lfl_uv[r << 1]);
+    }
+#else
+    filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+                             mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
+                             &lfm->lfl_uv[r << 1]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-      dst->buf += 8 * dst->stride;
-      mask_16x16 >>= 4;
-      mask_8x8 >>= 4;
-      mask_4x4 >>= 4;
-      mask_4x4_int >>= 4;
-    }
+    dst->buf += 8 * dst->stride;
+    mask_16x16 >>= 4;
+    mask_8x8 >>= 4;
+    mask_4x4 >>= 4;
+    mask_4x4_int >>= 4;
   }
 }
 
@@ -1576,11 +1531,19 @@
                           struct macroblockd_plane planes[MAX_MB_PLANE],
                           int start, int stop, int y_only) {
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
-  const int use_420 = y_only || (planes[1].subsampling_y == 1 &&
-                                 planes[1].subsampling_x == 1);
+  enum lf_path path;
   LOOP_FILTER_MASK lfm;
   int mi_row, mi_col;
 
+  if (y_only)
+    path = LF_PATH_444;
+  else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
+    path = LF_PATH_420;
+  else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
+    path = LF_PATH_444;
+  else
+    path = LF_PATH_SLOW;
+
   for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
     MODE_INFO *mi = cm->mi + mi_row * cm->mi_stride;
 
@@ -1590,16 +1553,23 @@
       vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
 
       // TODO(JBB): Make setup_mask work for non 420.
-      if (use_420)
-        vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
-                       &lfm);
+      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
+                     &lfm);
 
-      for (plane = 0; plane < num_planes; ++plane) {
-        if (use_420)
-          vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);
-        else
-          vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
-                                        mi_row, mi_col);
+      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+      for (plane = 1; plane < num_planes; ++plane) {
+        switch (path) {
+          case LF_PATH_420:
+            vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
+            break;
+          case LF_PATH_444:
+            vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
+            break;
+          case LF_PATH_SLOW:
+            vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
+                                          mi_row, mi_col);
+            break;
+        }
       }
     }
   }
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -29,6 +29,12 @@
 #define MAX_REF_LF_DELTAS       4
 #define MAX_MODE_LF_DELTAS      2
 
+enum lf_path {
+  LF_PATH_420,
+  LF_PATH_444,
+  LF_PATH_SLOW,
+};
+
 struct loopfilter {
   int filter_level;
 
@@ -92,10 +98,15 @@
                     MODE_INFO *mi_8x8, const int mode_info_stride,
                     LOOP_FILTER_MASK *lfm);
 
-void vp9_filter_block_plane(struct VP9Common *const cm,
-                            struct macroblockd_plane *const plane,
-                            int mi_row,
-                            LOOP_FILTER_MASK *lfm);
+void vp9_filter_block_plane_ss00(struct VP9Common *const cm,
+                                 struct macroblockd_plane *const plane,
+                                 int mi_row,
+                                 LOOP_FILTER_MASK *lfm);
+
+void vp9_filter_block_plane_ss11(struct VP9Common *const cm,
+                                 struct macroblockd_plane *const plane,
+                                 int mi_row,
+                                 LOOP_FILTER_MASK *lfm);
 
 void vp9_filter_block_plane_non420(struct VP9Common *cm,
                                    struct macroblockd_plane *plane,
--- a/vp9/common/vp9_thread_common.c
+++ b/vp9/common/vp9_thread_common.c
@@ -13,6 +13,7 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_thread_common.h"
 #include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_loopfilter.h"
 
 #if CONFIG_MULTITHREAD
 static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
@@ -92,10 +93,17 @@
                              int start, int stop, int y_only,
                              VP9LfSync *const lf_sync) {
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
-  const int use_420 = y_only || (planes[1].subsampling_y == 1 &&
-                                 planes[1].subsampling_x == 1);
   const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
   int mi_row, mi_col;
+  enum lf_path path;
+  if (y_only)
+    path = LF_PATH_444;
+  else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
+    path = LF_PATH_420;
+  else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
+    path = LF_PATH_444;
+  else
+    path = LF_PATH_SLOW;
 
   for (mi_row = start; mi_row < stop;
        mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {
@@ -112,16 +120,23 @@
       vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
 
       // TODO(JBB): Make setup_mask work for non 420.
-      if (use_420)
-        vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
-                       &lfm);
+      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
+                     &lfm);
 
-      for (plane = 0; plane < num_planes; ++plane) {
-        if (use_420)
-          vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);
-        else
-          vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
-                                        mi_row, mi_col);
+      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+      for (plane = 1; plane < num_planes; ++plane) {
+        switch (path) {
+          case LF_PATH_420:
+            vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
+            break;
+          case LF_PATH_444:
+            vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
+            break;
+          case LF_PATH_SLOW:
+            vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
+                                          mi_row, mi_col);
+            break;
+        }
       }
 
       sync_write(lf_sync, r, c, sb_cols);